# In [None]:  (notebook cell marker left over from export)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import anderson, shapiro, ttest_ind, chi2_contingency, linregress
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import statsmodels.api as sm

class DataAnalysis:
    """Interactive statistical toolkit built around one pandas DataFrame.

    A dataset is loaded from CSV with :meth:`dataset_loading`. The analysis
    methods print their results and show matplotlib figures rather than
    returning values, because they are driven from a text menu.

    Attributes:
        df: The loaded DataFrame, or ``None`` until a dataset is loaded.
        column_types: Maps each column name to ``"numeric"``,
            ``"categorical"`` or ``"object"`` (see :meth:`list_column_types`).
    """

    # Significance level shared by every hypothesis test in this class.
    ALPHA = 0.05
    # Samples smaller than this are never passed to the Shapiro-Wilk test,
    # which needs at least three observations.
    MIN_SHAPIRO_SAMPLES = 3

    def __init__(self):
        self.df = None
        self.column_types = {}

    def dataset_loading(self, file_path):
        """Load a CSV file and print summary statistics for every column.

        Args:
            file_path: Location of the CSV file, passed to ``pd.read_csv``.

        Raises:
            Any exception raised by ``pd.read_csv`` (for example
            ``FileNotFoundError``) is propagated to the caller.
        """
        self.df = pd.read_csv(file_path)
        self.column_types = self.list_column_types()
        self.variable_statistics()  # Show the summary right after loading.

    def list_column_types(self):
        """Classify each column as "numeric", "categorical" or "object".

        Returns:
            dict mapping column name to its type label.
        """
        column_types = {}
        for col in self.df.columns:
            series = self.df[col]
            if pd.api.types.is_numeric_dtype(series):
                column_types[col] = "numeric"
            # isinstance() replaces the deprecated is_categorical_dtype().
            elif isinstance(series.dtype, pd.CategoricalDtype):
                column_types[col] = "categorical"
            else:
                column_types[col] = "object"
        return column_types

    def variable_statistics(self):
        """Print mean/median/mode, kurtosis and skewness for every column.

        Non-numeric columns report "NA" for everything except the mode. A
        column without any non-missing value reports "NA" for the mode too.
        """
        summary_data = []
        for col in self.df.columns:
            series = self.df[col]

            # mode() is empty when the column holds no non-missing values,
            # so indexing it unconditionally would raise.
            modes = series.mode()
            mode_val = modes.iloc[0] if not modes.empty else "NA"

            if pd.api.types.is_numeric_dtype(series):
                mean_val = series.mean()
                median_val = series.median()
                kurtosis_val = series.kurt()
                skewness_val = series.skew()
            else:
                mean_val = median_val = kurtosis_val = skewness_val = "NA"

            summary_data.append({
                "Variable": col,
                "Mean / Median / Mode": f"{mean_val} / {median_val} / {mode_val}",
                "Kurtosis": kurtosis_val,
                "Skewness": skewness_val
            })

        summary_df = pd.DataFrame(summary_data)
        pd.set_option('display.max_columns', None)  # Display all columns
        print(summary_df)

    def list_variables(self):
        """Print a numbered list of the dataset's columns (0 = go back)."""
        print("\nFollowing are the variables in our dataset:")
        for i, col in enumerate(self.df.columns, 1):
            print(f"{i}. {col}")
        print("0. Return to main menu")

    def plot_distribution(self, col):
        """Show a histogram of ``col``; non-numeric columns only get a message."""
        if not pd.api.types.is_numeric_dtype(self.df[col]):
            print(f"'{col}' is not a numeric variable, cannot plot distribution.")
            return

        sns.histplot(self.df[col], kde=False)
        plt.title(f"Distribution plot for '{col}'")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.show()

    def plot_qq(self, col):
        """Show a normal Q-Q plot of the non-missing values of ``col``."""
        data = self.df[col].dropna()
        # line='s' scales the reference line to the sample's mean and
        # standard deviation. A plain 45-degree line is only meaningful for
        # data that has already been standardized.
        sm.qqplot(data, line='s')
        plt.title(f"Q-Q Plot for '{col}'")
        plt.show()

    def check_normality(self, col):
        """Show a Q-Q plot of ``col`` and run a normality test on it.

        More than 2000 non-missing observations use the Anderson-Darling
        test; anything smaller uses Shapiro-Wilk.

        Returns:
            ``(statistic, last critical value)`` for Anderson-Darling, or
            ``(statistic, p_value)`` for Shapiro-Wilk. The second elements
            are therefore NOT comparable between the two branches.
        """
        data = self.df[col].dropna()
        self.plot_qq(col)
        if len(data) > 2000:
            result = anderson(data)
            print(f"Anderson-Darling Test result: Statistic={result.statistic}, Critical Values={result.critical_values}")
            # The last entry of critical_values is the largest threshold
            # in the table, i.e. the one hardest to exceed.
            return result.statistic, result.critical_values[-1]

        stat, p_value = shapiro(data)
        print(f"Shapiro-Wilk Test result: Statistic={stat}, p-value={p_value}")
        return stat, p_value

    def _numeric_groups(self, continuous_var, categorical_var):
        """Split ``continuous_var`` into one numeric sample per category.

        The column is coerced to numeric on a copy, so ``self.df`` is never
        modified. Rows with a missing category are ignored, and groups left
        without any valid value are dropped.

        Returns:
            ``(values, groups)`` where ``values`` is the coerced column and
            ``groups`` is a list of ``(label, sample)`` pairs in the sorted
            label order produced by ``groupby``.
        """
        values = pd.to_numeric(self.df[continuous_var], errors='coerce')
        grouped = values.groupby(self.df[categorical_var], observed=True)
        groups = [(label, sample.dropna()) for label, sample in grouped]
        return values, [(label, sample) for label, sample in groups if len(sample) > 0]

    def _show_qq_plot(self, data, title):
        """Draw a normal Q-Q plot of ``data`` on a dedicated 10x6 figure."""
        # qqplot opens its own figure unless it is handed an Axes, which
        # would leave a blank figure behind; pass the Axes explicitly.
        fig, ax = plt.subplots(figsize=(10, 6))
        sm.qqplot(data, line='s', ax=ax)
        ax.set_title(title)
        ax.set_xlabel("Theoretical Quantiles")
        ax.set_ylabel("Sample Quantiles")
        plt.show()
        plt.close(fig)

    def _show_boxplot(self, values, categorical_var, continuous_var, title):
        """Draw a boxplot of the numeric ``values`` split by category."""
        plt.figure(figsize=(10, 6))
        sns.boxplot(x=self.df[categorical_var], y=values)
        plt.title(title)
        plt.xlabel(categorical_var)
        plt.ylabel(continuous_var)
        plt.show()
        plt.close()

    def conduct_t_test(self):
        """Prompt for two variables and compare two groups.

        Runs an independent-samples t-test when both groups pass the
        Shapiro-Wilk normality check; otherwise reports the Mann-Whitney U
        test as the non-parametric alternative. Exactly two groups with
        valid numeric data are required.
        """
        print("Following are the variables available for T-Test:")
        for column in self.df.columns:
            print(f"{column} (Type: {self.column_types[column]})")

        continuous_var = input("Enter the continuous variable: ")
        categorical_var = input("Enter the categorical variable: ")

        if continuous_var not in self.df.columns or categorical_var not in self.df.columns:
            print(f"Error: '{continuous_var}' or '{categorical_var}' variable does not exist in the DataFrame.")
            return

        values, groups = self._numeric_groups(continuous_var, categorical_var)

        # A t-test compares exactly two samples; with any other count the
        # extra samples would be misread as other ttest_ind arguments.
        if len(groups) != 2:
            print(f"Error: T-Test requires exactly 2 groups with valid data, but found {len(groups)}.")
            return

        (first_label, first_sample), (_, second_sample) = groups

        normality_results = []
        for i, (_, sample) in enumerate(groups):
            if len(sample) < self.MIN_SHAPIRO_SAMPLES:
                print(f"Group {i + 1} has too few observations for the Shapiro-Wilk test.")
                normality_results.append((np.nan, np.nan))
                continue
            stat, p = stats.shapiro(sample)
            normality_results.append((stat, p))
            print(f"Group {i + 1}: W={stat}, p-value={p}")

        print(f"Generating Q-Q Plot for the first group of {continuous_var}")
        # Label the plot with the group's own key, not the first value that
        # happens to appear in the column.
        self._show_qq_plot(
            first_sample,
            f"Q-Q Plot of {continuous_var} for Group {categorical_var} = {first_label}",
        )

        # A NaN p-value (untestable group) compares False and therefore
        # falls through to the non-parametric branch.
        if all(p > self.ALPHA for _, p in normality_results):
            stat, p_value = stats.ttest_ind(first_sample, second_sample)
            print(f"T-Test: Statistic={stat}, p-value={p_value}")
            self._show_boxplot(
                values, categorical_var, continuous_var,
                f"Boxplot of {continuous_var} by {categorical_var}",
            )
            return

        print("Data is not normally distributed, unable to perform T-Test. Consider non-parametric tests.")

        plt.figure(figsize=(10, 6))
        plt.hist(first_sample, bins=20, alpha=0.7, color='blue')
        plt.title(f"Histogram of {continuous_var} for Group {categorical_var} = {first_label}")
        plt.xlabel(continuous_var)
        plt.ylabel("Frequency")
        plt.axvline(first_sample.mean(), color='red', linestyle='dashed', linewidth=1, label='Mean')
        plt.axvline(first_sample.median(), color='green', linestyle='dashed', linewidth=1, label='Median')
        plt.legend()
        plt.show()
        plt.close()

        print("Shapiro-Wilk test results for each group (p-values):")
        for i, (_, p) in enumerate(normality_results):
            print(f"Group {i + 1}: p-value={p}")

        u_stat, u_p_value = stats.mannwhitneyu(first_sample, second_sample, alternative='two-sided')
        print(f"Mann-Whitney U Test: Statistic={u_stat}, p-value={u_p_value}")

    def conduct_anova(self):
        """Prompt for two variables and run a one-way ANOVA.

        When any group fails the Shapiro-Wilk normality check, a Q-Q plot
        and the Kruskal-Wallis test are shown as well. ``self.df`` is not
        modified. Errors raised by the statistics or plotting libraries are
        reported as a message so the calling menu keeps running.
        """
        print("For ANOVA, following are the variables available:")
        for column in self.df.columns:
            print(f"{column} (Type: {self.column_types[column]})")

        continuous_var = input("Enter the continuous variable: ")
        categorical_var = input("Enter the categorical variable: ")

        if continuous_var not in self.df.columns or categorical_var not in self.df.columns:
            print(f"Error: '{continuous_var}' or '{categorical_var}' variable does not exist in the DataFrame.")
            return

        try:
            values, groups = self._numeric_groups(continuous_var, categorical_var)
            samples = [sample for _, sample in groups]

            if len(samples) < 2:
                print("Error: Not enough groups for ANOVA.")
                return

            stat, p_value = stats.f_oneway(*samples)
            print(f"ANOVA Test: Stat={stat}, p-value={p_value}")

            if p_value < self.ALPHA:
                print(f"'{continuous_var}' has significant differences in means between categories of '{categorical_var}'.")
            else:
                print(f"No significant differences found for '{continuous_var}' between categories of '{categorical_var}'.")

            # Groups too small for Shapiro-Wilk are skipped, not failed.
            non_normal = any(
                stats.shapiro(sample)[1] < self.ALPHA
                for sample in samples
                if len(sample) >= self.MIN_SHAPIRO_SAMPLES
            )
            boxplot_title = f"Boxplot of {continuous_var} by {categorical_var}"

            if non_normal:
                print(f"'{continuous_var}' is not normally distributed, displaying Q-Q plot and performing Kruskal-Wallis test.")
                self._show_qq_plot(values.dropna(), f"Q-Q Plot of {continuous_var}")

                kw_stat, kw_p_value = stats.kruskal(*samples)
                print(f"Kruskal-Wallis result:\n"
                      f"Kruskal-Wallis Statistic: {kw_stat:.10f}\n"
                      f"p-value: {kw_p_value:.10f}")

                if kw_p_value < self.ALPHA:
                    print("The result is statistically significant.")
                    print("Rejecting the null hypothesis.")
                    print(f"'{continuous_var}' has significant differences across '{categorical_var}' categories.\n")
                else:
                    print("Cannot reject the null hypothesis.")

                boxplot_title += " (Kruskal-Wallis Test)"

            self._show_boxplot(values, categorical_var, continuous_var, boxplot_title)

        except Exception as e:
            # Deliberate best-effort boundary: report and return to the menu.
            print(f"Error during ANOVA: {e}")

    def perform_chi_square(self, cat_var1, cat_var2):
        """Run a chi-square test of independence between two columns.

        Prints an error instead of raising when a column is unknown or when
        no row has a value in both columns.
        """
        if cat_var1 not in self.df.columns or cat_var2 not in self.df.columns:
            print(f"Error: '{cat_var1}' or '{cat_var2}' variable does not exist in the DataFrame.")
            return

        contingency_table = pd.crosstab(self.df[cat_var1], self.df[cat_var2])
        if contingency_table.size == 0:
            print("Error: Not enough data to build a contingency table.")
            return

        stat, p_value, _, _ = chi2_contingency(contingency_table)
        print(f"Chi-square test result: Statistic={stat}, p-value={p_value}")

        if p_value < self.ALPHA:
            print("Reject the null hypothesis, there is an association between the variables.")
        else:
            print("Cannot reject the null hypothesis, no significant association found between the variables.")

    def perform_regression(self, x_var, y_var):
        """Fit a simple linear regression of ``y_var`` on ``x_var``.

        Both columns are coerced to numeric, and only rows where both values
        are present are used. Prints an error instead of raising when a
        column is unknown, fewer than two usable rows remain, or every x
        value is identical.
        """
        if x_var not in self.df.columns or y_var not in self.df.columns:
            print(f"Error: '{x_var}' or '{y_var}' variable does not exist in the DataFrame.")
            return

        # Drop missing values jointly so x and y stay paired row by row.
        pairs = pd.DataFrame({
            "x": pd.to_numeric(self.df[x_var], errors='coerce'),
            "y": pd.to_numeric(self.df[y_var], errors='coerce'),
        }).dropna()

        if len(pairs) < 2:
            print("Error: Not enough valid numeric data for regression.")
            return

        # linregress cannot fit a line when x has no variation.
        if pairs["x"].nunique() < 2:
            print("Error: Cannot perform regression because all values of the independent variable are identical.")
            return

        slope, intercept, r_value, p_value, std_err = linregress(
            pairs["x"].to_numpy(), pairs["y"].to_numpy()
        )
        print(f"Regression result: Slope={slope}, Intercept={intercept}, R-squared={r_value**2}, p-value={p_value}")
        if p_value < self.ALPHA:
            print("Reject the null hypothesis, there is a significant relationship between the variables.")
        else:
            print("Cannot reject the null hypothesis, no significant relationship found between the variables.")

    def perform_sentiment_analysis(self, col):
        """Score the text in ``col`` with VADER and print the results.

        Only non-missing entries longer than 10 characters are analysed.
        Compound scores of at least 0.05 are labelled "positive", at most
        -0.05 "negative", and everything in between "neutral".
        """
        analyzer = SentimentIntensityAnalyzer()

        valid_texts = self.df[col].dropna().astype(str)
        valid_texts = valid_texts[valid_texts.str.len() > 10]

        if valid_texts.empty:
            print("\nLooking for text data in your dataset...")
            print("Sorry, your dataset does not have suitable text data for sentiment analysis.")
            print("Returning to previous menu...\n")
            return

        scores, sentiments = [], []
        for text in valid_texts:
            try:
                score = analyzer.polarity_scores(text)['compound']
            except Exception as e:
                # Keep the row so the output stays aligned with the texts.
                print(f"Error processing text: {text}. Error: {e}")
                scores.append(None)
                sentiments.append('neutral')
                continue

            scores.append(score)
            if score >= 0.05:
                sentiments.append('positive')
            elif score <= -0.05:
                sentiments.append('negative')
            else:
                sentiments.append('neutral')

        sentiment_df = pd.DataFrame({'Text': valid_texts, 'Score': scores, 'Sentiment': sentiments})
        print(sentiment_df)

# Main program menu
def _columns_exist(df, *names):
    """Return True when every name is a column of ``df``; report the rest."""
    missing = [name for name in names if name not in df.columns]
    for name in missing:
        print(f"Error: Column '{name}' does not exist in the dataset.")
    return not missing


def main():
    """Run the interactive text menu for analysing a CSV dataset.

    Prompts for a file path, loads it into a ``DataAnalysis`` instance and
    then loops over the menu until the user chooses to quit. A dataset that
    cannot be loaded ends the program with an error message instead of a
    traceback.
    """
    analysis = DataAnalysis()
    file_path = input("Please enter the dataset file path: ").strip()
    try:
        analysis.dataset_loading(file_path)
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        print(f"Error: could not load dataset '{file_path}': {err}")
        return

    while True:
        print("\nHow do you want to analyze the data:")
        print("1. Plot variable distribution")
        print("2. Conduct ANOVA")
        print("3. Conduct t-Test")
        print("4. Conduct chi-Square")
        print("5. Conduct Regression")
        print("6. Conduct Sentiment Analysis")
        print("7. Quit")

        # Strip so that stray spaces around the digit are still accepted.
        choice = input("Enter your choice (1 – 7): ").strip()

        if choice == '1':
            analysis.list_variables()
            col_choice = input("\nEnter the variable number to visualize (or 0 to return to main menu): ").strip()
            if not col_choice.isdigit():
                print("Please enter a valid number.")
                continue

            col_index = int(col_choice)
            if col_index == 0:
                continue
            if 1 <= col_index <= len(analysis.df.columns):
                # The menu is 1-based; DataFrame columns are 0-based.
                analysis.plot_distribution(analysis.df.columns[col_index - 1])
            else:
                print("Invalid choice, please enter a valid number.")

        elif choice == '2':
            analysis.conduct_anova()

        elif choice == '3':
            analysis.conduct_t_test()

        elif choice == '4':
            cat_var1 = input("Enter the first categorical variable: ")
            cat_var2 = input("Enter the second categorical variable: ")
            # Validate here so a typo does not end the session with KeyError.
            if _columns_exist(analysis.df, cat_var1, cat_var2):
                analysis.perform_chi_square(cat_var1, cat_var2)

        elif choice == '5':
            x_var = input("Enter the independent variable: ")
            y_var = input("Enter the dependent variable: ")
            if _columns_exist(analysis.df, x_var, y_var):
                analysis.perform_regression(x_var, y_var)

        elif choice == '6':
            col = input("Enter the text column for sentiment analysis: ")
            if _columns_exist(analysis.df, col):
                analysis.perform_sentiment_analysis(col)

        elif choice == '7':
            print("Exiting the program. Thank you!")
            break

        else:
            print("Invalid choice, please enter a valid option.")

# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()