In [7]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# --- 1. Data Loading and Cleaning ---
# Load the raw CSV and discard the 'Unnamed: 0' column.
file_name = "Expanded_data_with_more_features (2).csv"
df = pd.read_csv(file_name).drop('Unnamed: 0', axis=1)

# Fill missing values: every object-typed column gets its most frequent value,
# the sibling count gets its median and is then cast to int.
categorical_cols = df.select_dtypes(include='object').columns.tolist()
mode_by_column = {col: df[col].mode()[0] for col in categorical_cols}
df = df.fillna(value=mode_by_column)

sibling_median = df['NrSiblings'].median()
df['NrSiblings'] = df['NrSiblings'].fillna(sibling_median).astype(int)

# Row-wise mean of the three subject scores.
subject_scores = df[['MathScore', 'ReadingScore', 'WritingScore']]
df['AverageScore'] = subject_scores.mean(axis=1)

# Save the cleaned data (OUTPUT 1). This happens before the target column is
# added, so the file does not contain 'High_Performance'.
df.to_csv('cleaned_data.csv', index=False)

# Target definition: rows whose average score is at or above the 75th
# percentile are labeled 1, all others 0.
threshold = df['AverageScore'].quantile(0.75)
is_top_quartile = df['AverageScore'] >= threshold
df['High_Performance'] = is_top_quartile.astype(int)

# Separate the target from the predictors; every score column is removed from
# the predictors because the target is derived from them.
score_cols = ['MathScore', 'ReadingScore', 'WritingScore', 'AverageScore']
Y = df['High_Performance']
X_full = df.drop(columns=[*score_cols, 'High_Performance'])

# Column groups used by the encoders further down.
ordinal_features = ['WklyStudyHours']
wkly_study_hours_order = ['< 5', '5 - 10', '> 10']
object_columns = X_full.select_dtypes(include='object').columns
nominal_features = [name for name in object_columns if name not in ordinal_features]
numerical_features = X_full.select_dtypes(include=np.number).columns.tolist()


# --- 2. Statistical Effect Analysis (ANOVA / correlation) ---
# For every predictor and every score column, test whether the predictor is
# associated with the score: one-way ANOVA for object-typed predictors,
# Pearson correlation for numeric predictors.
scores = ['MathScore', 'ReadingScore', 'WritingScore', 'AverageScore']
anova_features = X_full.select_dtypes(include='object').columns.tolist()
corr_features = X_full.select_dtypes(include=np.number).columns.tolist()
results_list = []

for feature in anova_features:
    # The row mask of each category depends only on the feature, so build the
    # masks once instead of once per score column.
    category_masks = [df[feature] == category for category in df[feature].unique()]
    for score in scores:
        groups = [df.loc[mask, score].dropna() for mask in category_masks]
        # A category whose scores are all missing contributes an empty sample,
        # which would make the F-test degenerate; leave such groups out.
        groups = [group for group in groups if not group.empty]
        if len(groups) > 1:
            f_statistic, p_value = stats.f_oneway(*groups)
            results_list.append({'Feature': feature, 'Target': score, 'Test': 'ANOVA', 'p-value': p_value, 'Correlation': np.nan})

for feature in corr_features:
    for score in scores:
        # Drop rows with a missing value in either column, mirroring the
        # dropna() applied on the ANOVA path; otherwise one missing score
        # turns the whole correlation into NaN.
        complete_pairs = df[[feature, score]].dropna()
        corr, p_value = stats.pearsonr(complete_pairs[feature], complete_pairs[score])
        results_list.append({'Feature': feature, 'Target': score, 'Test': 'Pearson Correlation', 'p-value': p_value, 'Correlation': corr})

summary_df = pd.DataFrame(results_list)
# Flag results below the 0.05 significance level.
summary_df['Significant Effect'] = summary_df['p-value'] < 0.05
summary_df = summary_df.sort_values(by=['Target', 'p-value']).reset_index(drop=True)

# Save the result table for the report (OUTPUT 2). The numeric columns are
# rendered as strings here, so report_df is for presentation only; summary_df
# keeps the numeric values.
report_df = summary_df[['Target', 'Feature', 'Test', 'p-value', 'Correlation', 'Significant Effect']].copy()
report_df['p-value'] = report_df['p-value'].apply(lambda x: f"{x:.3e}" if not pd.isna(x) else "")
report_df['Correlation'] = report_df['Correlation'].apply(lambda x: f"{x:.3f}" if not pd.isna(x) else "")
report_df.to_csv('feature_impact_summary.csv', index=False)


# --- 3. Model Setup and Comparison ---
def evaluate_model(model, X_test, Y_test, name):
    """Score a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature data passed to ``model.predict``.
        Y_test: True labels compared against the predictions.
        name: Label stored under the ``'Method'`` key.

    Returns:
        A dict holding the method name, the binary-averaged F1, precision and
        recall, and references to ``model``, ``X_test`` and ``Y_test`` so that
        callers can reuse them later.
    """
    predictions = model.predict(X_test)

    scorers = {
        'Precision': precision_score,
        'Recall': recall_score,
        'F1-Score': f1_score,
    }
    metrics = {
        label: scorer(Y_test, predictions, average='binary')
        for label, scorer in scorers.items()
    }

    return {
        'Method': name,
        'F1-Score': metrics['F1-Score'],
        'Precision': metrics['Precision'],
        'Recall': metrics['Recall'],
        'Model': model,
        'X_test': X_test,
        'Y_test': Y_test,
    }

# Both models use the same split settings: 30% test share, fixed seed,
# stratified on the target.
split_options = dict(test_size=0.3, random_state=42, stratify=Y)

# MODEL 1: FEATURE SELECTION (FS)
# Train on the predictors that remain after removing a fixed list of columns.
features_to_drop_fs = ['ParentMaritalStatus', 'TransportMeans', 'NrSiblings']
X_fs = X_full.drop(columns=features_to_drop_fs)
nominal_fs = [name for name in nominal_features if name not in features_to_drop_fs]
numerical_fs = [name for name in numerical_features if name not in features_to_drop_fs]

fs_transformers = [
    ('ord', OrdinalEncoder(categories=[wkly_study_hours_order]), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_fs),
    ('num', StandardScaler(), numerical_fs),
]
preprocessor_fs = ColumnTransformer(transformers=fs_transformers, remainder='drop')
forest_fs = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
pipeline_fs = Pipeline(steps=[('preprocessor', preprocessor_fs), ('classifier', forest_fs)])

X_train_fs, X_test_fs, Y_train, Y_test = train_test_split(X_fs, Y, **split_options)
pipeline_fs.fit(X_train_fs, Y_train)
results_fs = evaluate_model(pipeline_fs, X_test_fs, Y_test, "Feature Selection (FS)")

# MODEL 2: PCA (FEATURE EXTRACTION)
# Encode all predictors, then keep the principal components that explain 95%
# of the variance before fitting the forest.
pca_transformers = [
    ('ord', OrdinalEncoder(categories=[wkly_study_hours_order]), ordinal_features),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False), nominal_features),
    ('num', StandardScaler(), numerical_features),
]
preprocessor_pca = ColumnTransformer(transformers=pca_transformers, remainder='drop')
forest_pca = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
pipeline_pca = Pipeline(steps=[
    ('preprocessor', preprocessor_pca),
    ('pca', PCA(n_components=0.95, random_state=42)),
    ('classifier', forest_pca),
])

# Y_train / Y_test are rebound here by the second split.
X_train_pca, X_test_pca, Y_train, Y_test = train_test_split(X_full, Y, **split_options)
pipeline_pca.fit(X_train_pca, Y_train)
results_pca = evaluate_model(pipeline_pca, X_test_pca, Y_test, "PCA (Feature Extraction)")

# Compare the results: one row per model, best F1 first.
comparison_df = (
    pd.DataFrame([results_fs, results_pca])
    .sort_values(by='F1-Score', ascending=False)
    .reset_index(drop=True)
)

# Save the comparison table (metric columns only).
metric_columns = ['Method', 'F1-Score', 'Precision', 'Recall']
comparison_df[metric_columns].to_csv('model_comparison_metrics.csv', index=False)

# --- 4. Detailed Reporting for the Selected Model ---
# The Feature Selection (FS) pipeline is the model reported in detail.
# NOTE(review): this choice is hard-coded and is not derived from the F1
# ranking in comparison_df; confirm FS actually ranks first before relying on
# the "best model" wording of the outputs below.
best_model_results = results_fs
best_model_pipeline = best_model_results['Model']
Y_pred_best = best_model_pipeline.predict(best_model_results['X_test'])

# Save the classification report.
report = classification_report(
    best_model_results['Y_test'],
    Y_pred_best,
    target_names=['Normal Performance (0)', 'High Performance (1)'],
)
with open('classification_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

# Feature importance outputs.
classifier = best_model_pipeline.named_steps['classifier']
preprocessor = best_model_pipeline.named_steps['preprocessor']

# Read the output column names from the fitted preprocessor of the pipeline
# being reported. These are the names of the columns the classifier was
# trained on (after one-hot expansion), so they line up one-to-one with
# feature_importances_. Hand-assembling the list from the raw column groups is
# error-prone: it is easy to include columns that the FS model dropped.
all_feature_names = list(preprocessor.get_feature_names_out())

feature_importances = classifier.feature_importances_
importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Save the feature importances.
importance_df.to_csv('model_feature_importance.csv', index=False)

# Build and save the chart; only the 15 most important features are shown.
top_features = importance_df.head(15)
plt.figure(figsize=(12, 8))
plt.barh(top_features['Feature'], top_features['Importance'], color='darkred')
plt.xlabel('Özellik Önem Derecesi (Gini)')
plt.ylabel('Özellik')
plt.title('Random Forest Özellik Önem Sıralaması (Feature Selection Modeli)')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.savefig('random_forest_feature_importance.png')
plt.close()

print("Tüm işlemler tamamlandı. Ödev için gerekli 6 çıktı dosyası başarıyla üretilmiştir.")

Tüm işlemler tamamlandı. Ödev için gerekli 6 çıktı dosyası başarıyla üretilmiştir.
