In [37]:
import pandas as pd
import csv
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

In [38]:
# Open the original CSV file and create a new CSV file for corrected data
with open('data.csv', 'r') as original_file, open('corrected.csv', 'w', newline='') as corrected_file:
    # Create a CSV reader and writer objects
    reader = csv.reader(original_file, delimiter=';')
    writer = csv.writer(corrected_file)
    
    # Read the header and write it to the new CSV file
    header = next(reader)
    writer.writerow(header)
    
    # Iterate over each row in the original CSV
    for row in reader:
        # Split the single column into multiple columns
        # Write the split data into the new CSV file
        writer.writerow(row)

In [39]:
# Load the dataset
df = pd.read_csv('corrected.csv')
df.head()

# Preprocess the data
X = df.drop(['Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)'], axis=1)  # features
y = df['GDP']  # target variable (dropout)

In [40]:
# Label Encoding
label_encoder = OneHotEncoder()
y_encoded = label_encoder.fit_transform(y)

# One-Hot Encoding (if there are more than two classes)
# If there are only two classes, Label Encoding is sufficient
y_encoded = pd.get_dummies(y)


# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y_encoded)

# X_res, y_res = smote.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

# Feature Selection using SelectKBest
k = 10  # select top 10 features
selector = SelectKBest(mutual_info_classif, k=k)
X_train_selected = selector.fit_transform(X_train_std, y_train)
X_test_selected = selector.transform(X_test_std)

# Feature Extraction using PCA
pca = PCA(n_components=0.95)  # retain 95% of the variance
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Feature Extraction using t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_train_tsne = tsne.fit_transform(X_train_std)
X_test_tsne = tsne.transform(X_test_std)

# Train a random forest classifier on the original data
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train_std, y_train)
y_pred_rfc = rfc.predict(X_test_std)
print("Original Data:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfc))
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rfc))

# Train a random forest classifier on the selected features
rfc_selected = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_selected.fit(X_train_selected, y_train)
y_pred_rfc_selected = rfc_selected.predict(X_test_selected)
print("Selected Features:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfc_selected))
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc_selected))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rfc_selected))

# Train a random forest classifier on the PCA features
rfc_pca = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_pca.fit(X_train_pca, y_train)
y_pred_rfc_pca = rfc_pca.predict(X_test_pca)
print("PCA Features:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfc_pca))
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc_pca))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rfc_pca))

# Train a random forest classifier on the t-SNE features
rfc_tsne = RandomForestClassifier(n_estimators=100, random_state=42)
rfc_tsne.fit(X_train_tsne, y_train)
y_pred_rfc_tsne = rfc_tsne.predict(X_test_tsne)
print("t-SNE Features:")
print("Accuracy:", accuracy_score(y_test, y_pred_rfc_tsne))
print("Classification Report:")
print(classification_report(y_test, y_pred_rfc_tsne))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rfc_tsne))

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.