# Dataset Prep

## 1. Load Dataset

In [None]:
import pandas as pd

# Placeholder: replace with the path to the dataset CSV before running.
file_path = 'your_path_here'

# Load the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(file_path)

In [None]:
# Display the raw DataFrame to sanity-check the load.
df

## 2. Select Required Rows

In [None]:
# Keep only the narrative-task rows. `na=False` treats a missing `group_task`
# as "not a narrative" instead of yielding a NaN mask that cannot be used for
# boolean indexing; `regex=False` makes this a plain substring match.
is_narrative = df['group_task'].str.contains("narrative", regex=False, na=False)
all_narratives_df = df[is_narrative]

# Exclude participant 100, then renumber the remaining rows from 0.
all_narratives_df = all_narratives_df[all_narratives_df['participant_id'] != 100]
all_narratives_df = all_narratives_df.reset_index(drop=True)
all_narratives_df

## 3. Add Labels

In [None]:
# Work on an independent copy so the filtered frame stays untouched.
input_df = all_narratives_df.copy()
# Binary target: 0 when the task name contains 'nativespeaker', otherwise 1.
input_df['is_L2'] = [
    int('nativespeaker' not in task) for task in input_df['group_task']
]
input_df

In [None]:
# Drop every column that has at least one NaN (dropna's default how='any'),
# so only fully populated columns remain.
input_df = input_df.dropna(axis='columns')

input_df

# Correlation Analysis

In [None]:
# Predictor columns only: strip the identifiers and the target label.
non_feature_cols = ["group_task", "participant_id", "is_L2"]
ind_vars_dataset = input_df.drop(columns=non_feature_cols)
# Pairwise correlation matrix of the predictors (DataFrame.corr defaults).
corr_matrix = ind_vars_dataset.corr()

In [None]:
# Display the predictor correlation matrix.
corr_matrix

## VIF Calculation

In [None]:
# Standardize each predictor column: subtract its mean, then divide by its
# standard deviation (as computed by DataFrame.std with default arguments).
means = ind_vars_dataset.mean()
std_devs = ind_vars_dataset.std()
z_scores = ind_vars_dataset.sub(means, axis='columns').div(std_devs, axis='columns')

In [None]:
# Display the standardized predictors.
z_scores

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every standardized predictor, one row per
# feature. The design matrix is extracted once and reused for each column.
design_matrix = z_scores.values
vif_data = pd.DataFrame({
    "feature": z_scores.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

In [None]:
# Display the per-feature VIF table.
vif_data

## Removing Highly Correlated Pairs

In [None]:
import numpy as np

# find highly correlated pairs

abs_corr_matrix = corr_matrix.abs()
# Keep only the strict upper triangle so each unordered pair appears once and
# the diagonal (self-correlation) is excluded; everything else becomes NaN.
strict_upper_mask = np.triu(np.ones(abs_corr_matrix.shape), k=1).astype(bool)
upper = abs_corr_matrix.where(strict_upper_mask)

# Walk the matrix column by column and collect (column, row, |r|) for every
# cell above the 0.8 threshold; masked NaN cells never pass the comparison.
high_corr_pairs = []
for col in upper.columns:
    for row in upper.index:
        strength = upper.loc[row, col]
        if strength > 0.8:
            high_corr_pairs.append((col, row, strength))

# Strongest correlations first; the sort is stable, so ties keep scan order.
high_corr_pairs = sorted(high_corr_pairs, key=lambda pair: pair[2], reverse=True)

for col1, col2, corr in high_corr_pairs:
    print(f"{col1}, {col2}: {corr:.2f}")

In [None]:
# For every highly correlated pair, discard the member with the higher VIF
# (ties drop the first member of the pair). A feature can appear in several
# pairs, so it may already be gone when a later pair selects it again.

filtered_df = ind_vars_dataset

# Index by feature name so each VIF can be looked up directly.
vif_data_indexed = vif_data.set_index('feature')

dropped_features = []

for col1, col2, corr in high_corr_pairs:
    print(f"comparing {col1} and {col2}")
    vif1 = vif_data_indexed.loc[col1, 'VIF']
    vif2 = vif_data_indexed.loc[col2, 'VIF']

    # NOTE(review): if either VIF is NaN the comparison is False and col2 is
    # dropped - confirm that is the intended fallback.
    to_drop = col1 if vif1 >= vif2 else col2
    print(f"dropping {to_drop}")

    # Test membership explicitly rather than catching every exception: only
    # "column already removed" is expected here, and any other failure
    # (including a user interrupt) should surface.
    if to_drop in filtered_df.columns:
        filtered_df = filtered_df.drop(columns=to_drop)
        dropped_features.append(to_drop)
    else:
        print(f"**already dropped {to_drop}**")

In [None]:
# Summarize which features the VIF-based pruning removed.
print(f"\ntotal dropped features = {len(dropped_features)}")
print(f"features dropped: {dropped_features}")

In [None]:
# Display the predictors that survived the pruning.
filtered_df

In [None]:
# update input df to only include desired features
# NOTE: drop() raises KeyError when a listed column is missing, so re-running
# this cell without first rebuilding input_df will fail.
input_df = input_df.drop(columns=dropped_features)
input_df

# Data Splitting

The data is split into 70% training rows and 30% testing rows.

In [None]:
# imports
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the identifiers and the label; the label
# column is the target.
X = input_df.drop(columns=["group_task", "participant_id", "is_L2"])
y = input_df["is_L2"]

# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is row-level - it is neither stratified on y nor
# grouped by participant_id. Confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

# Tuning Hyperparameters
Hyperparameters are tuned with 5-fold cross-validation on the training set, scored by ROC AUC.

## SVM

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
# Base SVM handed to the hyperparameter search. class_weight="balanced"
# reweights classes inversely to their frequency. `probability` is left at its
# default (False), so the fitted model offers decision_function but not
# predict_proba.
svm_tune = SVC(random_state=42, class_weight="balanced") # basic model

In [None]:
# List the estimator's current parameters (i.e. what can be tuned).
svm_tune.get_params()

In [None]:
# Candidate values for the SVM search. Every entry is a finite list, so the
# space holds 7 * 2 * 2 * 3 = 84 distinct combinations.
# NOTE: per the scikit-learn docs, 'degree' only affects the 'poly' kernel and
# 'gamma' is not used by the 'linear' kernel, so many combinations describe
# the same linear model.
svm_param_dist = {
    'C': np.logspace(-2, 2, 7),  # 7 log-spaced candidate values from 0.01 to 100
    'kernel': ['linear', 'poly'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 5, 7]
}

In [None]:
# Randomized hyperparameter search for the SVM: 5-fold cross-validation on the
# training split, scored by ROC AUC, using all available cores.
# NOTE: svm_param_dist defines 84 combinations, fewer than n_iter=100;
# scikit-learn then evaluates each combination once and emits a warning.
svm_random_search = RandomizedSearchCV(
    estimator=svm_tune,
    param_distributions=svm_param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
svm_random_search.fit(X_train, y_train)
svm_random_search

In [None]:
# Report the parameter combination with the best cross-validated score.
print("Best hyperparameters found: ", svm_random_search.best_params_)

### Evaluate

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Estimator refit on the full training split with the best parameters found
# (the search object's default refit behavior).
best_svm_model = svm_random_search.best_estimator_

In [None]:
# Hard class predictions for the confusion matrix and classification report.
best_svm_preds = best_svm_model.predict(X_test)
# SVC only provides predict_proba when built with probability=True, which the
# tuned model is not, so calling it here would raise. The signed distance to
# the decision boundary is a valid ranking score for ROC analysis, so use that
# instead. The variable keeps its name because the ROC cell reads it.
best_svm_probs = best_svm_model.decision_function(X_test)

In [None]:
# Confusion matrix on the test split (scikit-learn convention: rows are true
# labels, columns are predicted labels).
tuned_svm_cm = confusion_matrix(y_test, best_svm_preds)
print("SVM Confusion Matrix:")
print(tuned_svm_cm)

In [None]:
# Heatmap of the SVM confusion matrix; fmt='d' renders the integer counts
# inside each cell.
plt.figure(figsize=(5,4))
sns.heatmap(tuned_svm_cm, annot=True, fmt='d', cmap='Blues')
plt.title("SVM Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the tuned SVM on the test split.
print("Tuned SVM Classification Report:")
print(classification_report(y_test, best_svm_preds))

In [None]:
# ROC curve and area under it for the tuned SVM on the test split.
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, best_svm_probs)
svm_auc = auc(svm_fpr, svm_tpr)

svm_roc_fig, svm_roc_ax = plt.subplots(figsize=(6, 5))
svm_roc_ax.plot(svm_fpr, svm_tpr, label=f"SVM ROC (AUC = {svm_auc:.2f})")
# Diagonal reference line: the expected curve of a random classifier.
svm_roc_ax.plot([0, 1], [0, 1], 'k--', label="Random Chance")
svm_roc_ax.set_title("SVM ROC Curve")
svm_roc_ax.set_xlabel("False Positive Rate")
svm_roc_ax.set_ylabel("True Positive Rate")
svm_roc_ax.legend(loc="lower right")
plt.show()

## DTC

In [None]:
# imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

In [None]:
# Base decision tree handed to the hyperparameter search; the fixed seed makes
# tree construction reproducible and class_weight="balanced" reweights classes
# inversely to their frequency.
dtc_tune = DecisionTreeClassifier(random_state=42, class_weight="balanced") # basic model

In [None]:
# List the estimator's current parameters (i.e. what can be tuned).
dtc_tune.get_params()

In [None]:
# Candidate values for the decision-tree hyperparameter search.
# 'log_loss' is intentionally not listed under 'criterion': scikit-learn
# documents 'entropy' and 'log_loss' as the same Shannon information gain
# criterion, so including both would only add duplicate candidates.
dtc_param_dist = {
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 8, 10],
    'max_features': ['sqrt', 'log2', None],
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [None, 10, 20, 50],
    'splitter': ['best', 'random'],
}

In [None]:
# Exhaustive search over every combination in dtc_param_dist: 5-fold
# cross-validation on the training split, scored by ROC AUC, on all cores.
# NOTE: despite the variable name, this is a GridSearchCV, not a randomized
# search.
dtc_random_search = GridSearchCV(
    estimator=dtc_tune,
    param_grid=dtc_param_dist,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
dtc_random_search.fit(X_train, y_train)
dtc_random_search

In [None]:
# Report the parameter combination with the best cross-validated score.
print("Best hyperparameters found: ", dtc_random_search.best_params_)

### Evaluate

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Estimator refit on the full training split with the best parameters found
# (the search object's default refit behavior).
best_dtc_model = dtc_random_search.best_estimator_

In [None]:
# Hard class predictions for the confusion matrix and classification report.
best_dtc_preds = best_dtc_model.predict(X_test)
# Column 1 of predict_proba is the probability of the second class in
# classes_ - presumably is_L2 == 1, given the 0/1 labels; used for the ROC curve.
best_dtc_probs = best_dtc_model.predict_proba(X_test)[:, 1]

In [None]:
# Confusion matrix on the test split (scikit-learn convention: rows are true
# labels, columns are predicted labels).
tuned_dtc_cm = confusion_matrix(y_test, best_dtc_preds)
print("DTC Confusion Matrix:")
print(tuned_dtc_cm)

In [None]:
# Heatmap of the decision-tree confusion matrix; fmt='d' renders the integer
# counts inside each cell.
plt.figure(figsize=(5,4))
sns.heatmap(tuned_dtc_cm, annot=True, fmt='d', cmap='Blues')
plt.title("DTC Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
# Per-class precision / recall / F1 for the tuned decision tree on the test split.
print("Tuned DTC Classification Report:")
print(classification_report(y_test, best_dtc_preds))

In [None]:
# ROC curve and area under it for the tuned decision tree on the test split.
dtc_fpr, dtc_tpr, dtc_thresholds = roc_curve(y_test, best_dtc_probs)
dtc_auc = auc(dtc_fpr, dtc_tpr)

dtc_roc_fig, dtc_roc_ax = plt.subplots(figsize=(6, 5))
dtc_roc_ax.plot(dtc_fpr, dtc_tpr, label=f"DTC ROC (AUC = {dtc_auc:.2f})")
# Diagonal reference line: the expected curve of a random classifier.
dtc_roc_ax.plot([0, 1], [0, 1], 'k--', label="Random Chance")
dtc_roc_ax.set_title("DTC ROC Curve")
dtc_roc_ax.set_xlabel("False Positive Rate")
dtc_roc_ax.set_ylabel("True Positive Rate")
dtc_roc_ax.legend(loc="lower right")
plt.show()