# In [None]:
# model_training.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

def _regression_metrics(y_true, y_pred):
    """Return ``(rmse, r2)`` for the given true values and predictions."""
    # RMSE is computed as sqrt(MSE): the ``squared=False`` keyword of
    # ``mean_squared_error`` was deprecated in scikit-learn 1.4 and removed
    # in 1.6, so relying on it breaks on current releases.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse, r2_score(y_true, y_pred)


def train_and_analyze():
    """Train regressors on ``preprocessed_data.csv`` and write analysis artifacts.

    The pipeline reads ``preprocessed_data.csv`` from the working directory,
    drops object-dtype columns and rows containing NaNs, and uses the
    ``ScaleScore`` column as the regression target. It then:

    * saves a correlation heatmap (``correlation_matrix.png``);
    * fits a random forest, persists it and the feature list under
      ``model_deployment/``, and saves ``feature_importance.png``;
    * compares random forest, linear regression and gradient boosting on a
      held-out 20% split and writes ``model_evaluation_summary.csv``;
    * runs KMeans clustering, an optional Welch t-test on gender dummy
      columns, and IsolationForest anomaly detection;
    * saves ``scale_score_distribution.png`` and, when ``Grade_*`` columns
      exist, ``scale_score_by_grade.png``;
    * prints the five test rows with the lowest predicted score.

    Returns:
        None. All results are printed or written to disk.

    Raises:
        KeyError: If no ``ScaleScore`` column remains after cleaning.
        ValueError: If no rows remain after dropping NaNs.
    """
    # Function-scope imports, matching the file's existing use of a local
    # import for IsolationForest.
    import os
    from sklearn.ensemble import IsolationForest

    # Load preprocessed data
    df_model = pd.read_csv('preprocessed_data.csv')
    print(f"Data Shape: {df_model.shape}")

    # Identify non-numeric columns
    print("Identifying non-numeric columns...")
    non_numeric_columns = df_model.select_dtypes(include=['object']).columns.tolist()
    print(f"Non-numeric columns: {non_numeric_columns}")

    if non_numeric_columns:
        print("Warning: Non-numeric columns detected. Dropping them.")
        df_model = df_model.drop(columns=non_numeric_columns)

    # Ensure all data is numeric
    print("Ensuring all data is numeric...")
    df_model = df_model.apply(pd.to_numeric, errors='coerce')
    df_model = df_model.dropna()
    print(f"Data Shape after dropping NaNs: {df_model.shape}")

    # Fail early with a clear message instead of a bare KeyError or an
    # opaque error from train_test_split further down.
    if 'ScaleScore' not in df_model.columns:
        raise KeyError(
            "'ScaleScore' column is missing after cleaning "
            "(it may have been dropped as non-numeric)."
        )
    if df_model.empty:
        raise ValueError("No rows left after dropping NaNs; cannot train models.")

    # Define target variable and features
    y = df_model['ScaleScore']
    X = df_model.drop(['ScaleScore'], axis=1)

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # ----------------------------------------
    # 1. Correlation Analysis
    # ----------------------------------------
    print("\nPerforming Correlation Analysis...")
    corr_matrix = df_model.corr()
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, cmap='coolwarm', linewidths=0.5)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.savefig('correlation_matrix.png')
    plt.close()
    print("Correlation matrix saved as 'correlation_matrix.png'.")

    # Identify top correlated features with the target variable
    corr_with_target = corr_matrix['ScaleScore'].abs().sort_values(ascending=False)
    print("\nTop features correlated with ScaleScore:")
    print(corr_with_target.head(10))

    # ----------------------------------------
    # 2. Feature Importance using Random Forest
    # ----------------------------------------
    print("\nTraining Random Forest Regressor for Feature Importance...")
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Save the model and feature list; joblib.dump does not create the
    # target directory, so make sure it exists first.
    os.makedirs('model_deployment', exist_ok=True)
    joblib.dump(rf_model, 'model_deployment/student_performance_model.pkl')
    joblib.dump(X.columns.tolist(), 'model_deployment/model_features.pkl')

    # Feature importance
    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': rf_model.feature_importances_
    }).sort_values(by='Importance', ascending=False)

    # Plot Feature Importance
    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(20))
    plt.title('Top 20 Feature Importances')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    print("Feature importance plot saved as 'feature_importance.png'.")

    # ----------------------------------------
    # 3. Model Evaluation
    # ----------------------------------------
    print("\nEvaluating Random Forest Model...")
    y_pred_rf = rf_model.predict(X_test)
    rmse_rf, r2_rf = _regression_metrics(y_test, y_pred_rf)
    print(f"Random Forest RMSE: {rmse_rf:.2f}")
    print(f"Random Forest R^2 Score: {r2_rf:.2f}")

    # ----------------------------------------
    # 4. Model Comparison
    # ----------------------------------------
    print("\nComparing Different Models...")

    # Linear Regression
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    rmse_lr, r2_lr = _regression_metrics(y_test, lr_model.predict(X_test))
    print(f"Linear Regression RMSE: {rmse_lr:.2f}")
    print(f"Linear Regression R^2 Score: {r2_lr:.2f}")

    # Gradient Boosting Regressor
    gb_model = GradientBoostingRegressor(random_state=42)
    gb_model.fit(X_train, y_train)
    rmse_gb, r2_gb = _regression_metrics(y_test, gb_model.predict(X_test))
    print(f"Gradient Boosting RMSE: {rmse_gb:.2f}")
    print(f"Gradient Boosting R^2 Score: {r2_gb:.2f}")

    # Compare RMSE and R^2
    models_evaluation = pd.DataFrame({
        'Model': ['Random Forest', 'Linear Regression', 'Gradient Boosting'],
        'RMSE': [rmse_rf, rmse_lr, rmse_gb],
        'R^2 Score': [r2_rf, r2_lr, r2_gb]
    })
    print("\nModel Evaluation Summary:")
    print(models_evaluation)

    # ----------------------------------------
    # 5. Clustering Analysis
    # ----------------------------------------
    print("\nPerforming Clustering Analysis...")
    # Using KMeans clustering on the full feature matrix (target excluded)
    kmeans = KMeans(n_clusters=3, random_state=42)
    kmeans.fit(X)
    df_model['Cluster'] = kmeans.labels_

    # ----------------------------------------
    # 6. Statistical Analysis
    # ----------------------------------------
    print("\nPerforming Statistical Analysis...")

    # Example: Compare ScaleScore between genders
    if 'Gender_Male' in df_model.columns and 'Gender_Female' in df_model.columns:
        male_scores = df_model.loc[df_model['Gender_Male'] == 1, 'ScaleScore']
        female_scores = df_model.loc[df_model['Gender_Female'] == 1, 'ScaleScore']

        # Welch's t-test (equal_var=False does not assume equal variances)
        t_stat, p_value = stats.ttest_ind(male_scores, female_scores, equal_var=False)
        print("T-test between male and female ScaleScores:")
        print(f"T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}")
    else:
        print("Gender columns not found for statistical analysis.")

    # ----------------------------------------
    # 7. Anomaly Detection
    # ----------------------------------------
    print("\nPerforming Anomaly Detection...")
    iso_forest = IsolationForest(contamination=0.01, random_state=42)
    # fit_predict labels outliers as -1 and inliers as 1
    df_model['Anomaly'] = iso_forest.fit_predict(X)

    num_anomalies = (df_model['Anomaly'] == -1).sum()
    print(f"Number of anomalies detected: {num_anomalies}")

    # ----------------------------------------
    # 8. Data Visualization
    # ----------------------------------------
    print("\nCreating Data Visualizations...")

    # Distribution of ScaleScore
    plt.figure(figsize=(8, 6))
    sns.histplot(df_model['ScaleScore'], kde=True)
    plt.title('Distribution of Scale Scores')
    plt.xlabel('Scale Score')
    plt.ylabel('Frequency')
    plt.savefig('scale_score_distribution.png')
    plt.close()
    print("Scale score distribution plot saved as 'scale_score_distribution.png'.")

    # Boxplot of ScaleScore by Grade
    grade_cols = [col for col in df_model.columns if 'Grade_' in col]
    if grade_cols:
        # Create the figure only when it will be drawn and closed, so no
        # empty figure is left open when grade columns are absent.
        plt.figure(figsize=(12, 8))
        # Recover a single grade label per row from the Grade_* columns by
        # taking the column holding the row maximum.
        df_model['Grade'] = df_model[grade_cols].idxmax(axis=1).str.replace('Grade_', '')
        sns.boxplot(x='Grade', y='ScaleScore', data=df_model)
        plt.title('Scale Score by Grade')
        plt.xlabel('Grade')
        plt.ylabel('Scale Score')
        plt.savefig('scale_score_by_grade.png')
        plt.close()
        print("Scale score by grade plot saved as 'scale_score_by_grade.png'.")
    else:
        print("Grade columns not found for boxplot.")

    # ----------------------------------------
    # 9. Predictive Insights
    # ----------------------------------------
    print("\nGenerating Predictive Insights...")

    # Identify top 5 students with the lowest predicted scores
    X_test_copy = X_test.copy()
    X_test_copy['PredictedScaleScore'] = y_pred_rf
    X_test_copy['ActualScaleScore'] = y_test
    X_test_copy['Residual'] = y_test - y_pred_rf
    lowest_scores = X_test_copy.nsmallest(5, 'PredictedScaleScore')
    print("Students with lowest predicted Scale Scores:")
    print(lowest_scores[['PredictedScaleScore', 'ActualScaleScore', 'Residual']])

    # ----------------------------------------
    # 10. Save Evaluation Metrics
    # ----------------------------------------
    models_evaluation.to_csv('model_evaluation_summary.csv', index=False)
    print("Model evaluation summary saved as 'model_evaluation_summary.csv'.")

# Run the full training/analysis pipeline only when this file is executed as
# a script, not when it is imported as a module.
if __name__ == "__main__":
    train_and_analyze()
