# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, GPT2Tokenizer, GPT2LMHeadModel
import torch

# Function to encode categorical data
def encode_categorical_columns(df):
    """Label-encode every object-dtype column of ``df`` in place.

    Each object column is replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``, and a message listing the learned
    classes is printed for every encoded column.

    Args:
        df: DataFrame to encode. It is modified in place.

    Returns:
        A tuple ``(df, encoders, encoded_columns)`` where ``df`` is the same
        object that was passed in, ``encoders`` maps column name to its
        fitted ``LabelEncoder`` and ``encoded_columns`` maps column name to
        the list of that encoder's classes.
    """
    encoders = {}
    encoded_columns = {}

    object_columns = df.select_dtypes(include=['object']).columns
    for name in object_columns:
        label_encoder = LabelEncoder()
        df[name] = label_encoder.fit_transform(df[name])

        classes = list(label_encoder.classes_)
        encoders[name] = label_encoder
        encoded_columns[name] = classes

        print(f"Column '{name}' has been encoded. Classes: {classes}")

    return df, encoders, encoded_columns

# Function to plot the data
def plot_data(df, plot_types):
    """Show the requested plot kinds for every ordered pair of distinct columns.

    One figure is created per ordered ``(col1, col2)`` pair with
    ``col1 != col2``, containing one subplot per requested plot kind.

    Args:
        df: DataFrame whose columns are plotted against each other.
        plot_types: Comma-separated plot kinds. Supported kinds are ``bar``,
            ``box``, ``scatter``, ``line`` and ``hist``; ``histogram`` is
            accepted as an alias for ``hist``. Names are matched
            case-insensitively and surrounding whitespace is ignored.

    Returns:
        None. Unsupported names are reported and skipped; if no supported
        kind remains, nothing is drawn.
    """
    aliases = {'histogram': 'hist'}
    supported = ('bar', 'box', 'scatter', 'line', 'hist')

    # Validate up front so that a typo does not yield a blank, titled subplot
    # for every column pair.
    kinds = []
    for raw in plot_types.split(','):
        name = raw.strip().lower()
        name = aliases.get(name, name)
        if name in supported:
            kinds.append(name)
        elif name:
            print(f"Skipping unsupported plot type: '{raw.strip()}'")

    if not kinds:
        print("No supported plot types requested; nothing to plot.")
        return

    columns = df.columns
    for col1 in columns:
        for col2 in columns:
            if col1 == col2:
                continue

            # squeeze=False always returns a 2-D array of Axes, so the
            # single-plot case needs no special handling.
            fig, axes = plt.subplots(
                nrows=1, ncols=len(kinds), figsize=(15, 5), squeeze=False
            )
            fig.suptitle(f'{col1} vs {col2}')

            for ax, kind in zip(axes[0], kinds):
                if kind == 'bar':
                    df.groupby([col1, col2]).size().unstack().plot(kind='bar', stacked=True, ax=ax)
                elif kind == 'box':
                    df[[col1, col2]].plot(kind='box', ax=ax)
                elif kind == 'scatter':
                    df.plot(kind='scatter', x=col1, y=col2, ax=ax)
                elif kind == 'line':
                    df.plot(kind='line', x=col1, y=col2, ax=ax)
                elif kind == 'hist':
                    df[[col1, col2]].plot(kind='hist', ax=ax)
                ax.set_title(f'{kind.capitalize()} plot')

            plt.tight_layout()
            plt.show()
            # Release the figure; one is created per column pair, so leaving
            # them open accumulates memory.
            plt.close(fig)

# Function to identify outliers
def find_outliers(df):
    """Collect, per numeric column, the values outside the 1.5 * IQR fences.

    Args:
        df: DataFrame to scan. Only columns selected by ``np.number`` are
            examined.

    Returns:
        Dict mapping each numeric column name to the list of its values that
        lie below ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``.
    """
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    flagged = {}

    for name in numeric_columns:
        series = df[name]
        lower_quartile = series.quantile(0.25)
        upper_quartile = series.quantile(0.75)
        spread = upper_quartile - lower_quartile

        low_fence = lower_quartile - 1.5 * spread
        high_fence = upper_quartile + 1.5 * spread

        is_outlier = (series < low_fence) | (series > high_fence)
        flagged[name] = series[is_outlier].tolist()

    return flagged

# Function to convert dataframe to sentences
def convert_to_sentences(data):
    """Render every row of ``data`` as text and join the rows with spaces.

    Each row becomes ``"<col> is <val>"`` fragments joined by ``", "``; the
    per-row strings are then joined by a single space.

    Args:
        data: DataFrame to render.

    Returns:
        One string covering all rows (empty when ``data`` has no rows).
    """
    def describe(row):
        # One "<column> is <value>" fragment per cell of the row.
        return ", ".join(f"{col} is {val}" for col, val in row.items())

    return " ".join(describe(row) for _, row in data.iterrows())

# Function to process text with CodeParrot
def process_with_codeparrot(text):
    """Run ``text`` through the ``codeparrot/codeparrot-small`` causal LM.

    The tokenizer and model are loaded with ``from_pretrained`` on every
    call. The input is tokenized with truncation at 512 tokens, up to 100
    new tokens are generated, and the first output sequence is decoded with
    special tokens skipped.

    Args:
        text: Text handed to the tokenizer as the model input.

    Returns:
        The decoded first output sequence.
    """
    checkpoint = "codeparrot/codeparrot-small"
    tok = AutoTokenizer.from_pretrained(checkpoint)
    lm = AutoModelForCausalLM.from_pretrained(checkpoint)

    # Reuse the EOS token for padding; the tokenizer call below asks for
    # padding=True.
    tok.pad_token = tok.eos_token

    encoded = tok(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    generated = lm.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=100,
    )

    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

# Function to process text with GPT-2
def process_with_gpt2(text):
    """Run ``text`` through the ``gpt2`` language model.

    The tokenizer and model are loaded with ``from_pretrained`` on every
    call. The input is tokenized with truncation at 512 tokens, up to 100
    new tokens are generated, and the first output sequence is decoded with
    special tokens skipped.

    Args:
        text: Text handed to the tokenizer as the model input.

    Returns:
        The decoded first output sequence.
    """
    checkpoint = 'gpt2'
    tok = GPT2Tokenizer.from_pretrained(checkpoint)
    lm = GPT2LMHeadModel.from_pretrained(checkpoint)

    # Reuse the EOS token for padding; the tokenizer call below asks for
    # padding=True.
    tok.pad_token = tok.eos_token

    encoded = tok(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    generated = lm.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_new_tokens=100,
    )

    first_sequence = generated[0]
    return tok.decode(first_sequence, skip_special_tokens=True)

# Function to perform detailed analysis and prompt engineering
def detailed_analysis(df, codeparrot_text, gpt2_text):
    """Summarise the numeric columns of ``df`` and ask GPT-2 to describe them.

    Descriptive statistics and the IQR outliers from ``find_outliers`` are
    rendered into a prompt, ``codeparrot_text`` and ``gpt2_text`` are
    appended, and the result is passed to the ``gpt2`` checkpoint to generate
    up to 100 new tokens.

    Args:
        df: DataFrame to analyse. Only its numeric columns contribute to the
            statistics.
        codeparrot_text: Text appended after the prompt.
        gpt2_text: Text appended after ``codeparrot_text``.

    Returns:
        A tuple ``(analysis_text, detailed_description)``: the formatted
        statistics and the decoded first output sequence of the model.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    # mode() has no rows when there is nothing to count (no rows or no numeric
    # columns); .iloc[0] would then raise IndexError, so fall back to NaNs.
    modes = numeric_df.mode()
    if len(modes):
        mode_row = modes.iloc[0]
    else:
        mode_row = pd.Series(np.nan, index=numeric_df.columns)

    # Computed once and reused for the derived 'range' and 'IQR' entries.
    minimum = numeric_df.min()
    maximum = numeric_df.max()
    q25 = numeric_df.quantile(0.25)
    q75 = numeric_df.quantile(0.75)

    stats = {
        'mean': numeric_df.mean(),
        'median': numeric_df.median(),
        'mode': mode_row,
        'std_dev': numeric_df.std(),
        'variance': numeric_df.var(),
        'skewness': numeric_df.skew(),
        'kurtosis': numeric_df.kurt(),
        'min': minimum,
        'max': maximum,
        'range': maximum - minimum,
        '25%': q25,
        '50% (median)': numeric_df.quantile(0.50),
        '75%': q75,
        'IQR': q75 - q25,
        'correlation': numeric_df.corr()
    }

    analysis_text = "\n".join([f"{key}:\n{value}\n" for key, value in stats.items()])
    outliers = find_outliers(df)

    # NOTE(review): the bracketed domain description is an unfilled template
    # placeholder and is sent to the model verbatim.
    prompt = (
        "The dataset contains information related to [Brief Description of the Dataset's Domain]. "
        f"Here is a detailed analytical summary of the data based on the following statistical analysis:\n\n{analysis_text}\n"
        f"Outliers:\n{outliers}\n\n"
        "Please provide a detailed summary including:\n"
        "- Structure of the dataset (columns and their meanings)\n"
        "- Sample data points (5 rows for example)\n"
        "- Key insights derived from the statistical analysis (relationships between variables, distributions, etc.)\n"
        "- Any notable trends or patterns in the data\n"
        "- Unusual or unexpected findings (anomalies)\n"
        "- Actionable recommendations or potential areas for further investigation based on the analysis"
    )

    combined_text = codeparrot_text + " " + gpt2_text

    # Separate the instructions from the appended model outputs; without a
    # separator the last bullet runs straight into the appended text.
    full_prompt = prompt + "\n\n" + combined_text

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    # Ensure pad token is set
    tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): truncation caps the input at 512 tokens, so a long
    # statistics section can push the later parts of the prompt out of the
    # model input - confirm this is acceptable.
    inputs = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_new_tokens=100)
    detailed_description = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return analysis_text, detailed_description

def main(file_path):
    """Run the full pipeline on the CSV file at ``file_path``.

    Steps: read the CSV, label-encode object columns, ask the user which plot
    kinds to draw and draw them, turn the frame into text, pass that text
    through CodeParrot, pass CodeParrot's output through GPT-2, and finally
    print the GPT-2 description produced by ``detailed_analysis`` together
    with the classes of every encoded column.

    Args:
        file_path: Path of the CSV file to load with ``pd.read_csv``.
    """
    df = pd.read_csv(file_path)

    # Encoding categorical columns
    df, _encoders, encoded_columns = encode_categorical_columns(df)

    # Asking user for plot types. The example names 'hist' because that is
    # the kind name plot_data checks for.
    plot_types = input("Enter the types of plots you want (e.g., hist, scatter, line, bar, box), separated by commas: ")

    # Plotting the data
    plot_data(df, plot_types)

    # Convert dataframe to sentences
    text_data = convert_to_sentences(df)

    # Process text with CodeParrot
    codeparrot_output = process_with_codeparrot(text_data)
    print("CodeParrot Output:")
    print(codeparrot_output)

    # Process text with GPT-2 (fed with CodeParrot's output, not the raw text)
    gpt2_output = process_with_gpt2(codeparrot_output)
    print("GPT-2 Output:")
    print(gpt2_output)

    # Performing detailed analysis; only the generated description is printed
    _analysis, description = detailed_analysis(df, codeparrot_output, gpt2_output)
    print("Detailed Description:\n")
    print(description)
    print("\nEncoded Columns:\n")
    for col, classes in encoded_columns.items():
        print(f"{col}: {classes}")

# Example file path
file_path = 'test_data.csv'

# Only run the pipeline when executed as a script or notebook cell, so that
# importing this module does not trigger file reads, prompts and model loads.
if __name__ == "__main__":
    main(file_path)