## Data Preprocessing

In [1]:
import pandas as pd

In [2]:
phone_data = pd.read_csv('../textgrid_to_feature/all_textgrid_features.csv')
opensmile_data = pd.read_csv("../wav_to_feature_opensmile/all_features_combined.csv")

In [3]:
phone_data.head(5)

Unnamed: 0,source_file,iu_text,iu_start,iu_end,AA0,AA1,AA2,AE0,AE1,AE2,...,Y,Z,ZH,sil,sp,spn,cs_unk,cs_other_language,cs_non_english,cs_tokens
0,VF32A_English_I2_20190213,uh,8.41385,8.50385,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,VF32A_English_I2_20190213,i was born in vancouver,8.50385,9.78385,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,
2,VF32A_English_I2_20190213,locally,14.04385,14.52385,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,VF32A_English_I2_20190213,yes,14.52385,14.74385,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,
4,VF32A_English_I2_20190213,on the lower mainland,14.74385,15.60385,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,


In [4]:
opensmile_data.rename(columns={'filename': 'source_file'}, inplace=True)

In [5]:
full_joined_df = pd.merge(phone_data, opensmile_data, on=['source_file', 'iu_start', 'iu_end', 'iu_text'], how='outer')
full_joined_df

Unnamed: 0,source_file,iu_text,iu_start,iu_end,AA0,AA1,AA2,AE0,AE1,AE2,...,mfcc_sma_de[14]_peakRangeRel,mfcc_sma_de[14]_peakMeanAbs,mfcc_sma_de[14]_peakMeanMeanDist,mfcc_sma_de[14]_peakMeanRel,mfcc_sma_de[14]_minRangeRel,mfcc_sma_de[14]_meanRisingSlope,mfcc_sma_de[14]_stddevRisingSlope,mfcc_sma_de[14]_meanFallingSlope,mfcc_sma_de[14]_stddevFallingSlope,iu_index
0,VF19A_English_I1_20181114,i was born in vancouver,12.12539,13.61539,0,0,0,1,0,0,...,0.397643,1.991329,1.978806,19.780094,0.512881,93.984740,56.653587,65.43952,43.070736,0.0
1,VF19A_English_I1_20181114,and i've lived in burnaby slash vancouver for ...,13.61539,16.68539,0,0,0,1,1,0,...,0.566905,1.562064,1.582763,-18.917679,0.435467,72.821840,42.984330,74.03557,39.926050,1.0
2,VF19A_English_I1_20181114,i went to school in burnaby,21.00939,22.30939,0,0,0,0,0,0,...,0.395774,1.961130,1.836907,13.104545,0.525650,84.599106,32.970352,79.74679,38.356453,2.0
3,VF19A_English_I1_20181114,preschool,25.25539,25.77539,0,0,0,0,0,0,...,0.329192,2.603495,2.588611,19.828833,0.478456,101.590614,65.563330,63.26734,60.372260,3.0
4,VF19A_English_I1_20181114,elementary,25.77539,26.23539,0,0,0,0,0,0,...,0.495244,0.793145,1.124541,-2.393347,0.210996,62.771336,14.432039,32.79706,19.180704,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21412,VM34A_English_I2_20191028,kind of wacky,1329.13062,1329.75062,0,0,0,0,1,0,...,,,,,,,,,,
21413,VM34A_English_I2_20191028,you just,1329.75062,1330.11062,0,0,0,0,0,0,...,,,,,,,,,,
21414,VM34A_English_I2_20191028,question twice,1330.72762,1331.39762,0,0,0,0,0,0,...,,,,,,,,,,
21415,VM34A_English_I2_20191028,on using like,1331.39762,1331.96762,0,1,0,0,0,0,...,,,,,,,,,,


## Baseline model

In [6]:
from sklearn.model_selection import train_test_split

X = full_joined_df.drop(columns=['source_file', 'iu_start', 'iu_end', 'iu_text', 'cs_unk', 'cs_other_language', 'cs_non_english', 'cs_tokens'])
y = full_joined_df['cs_non_english']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,                # Feature matrix and target vector
    test_size=0.2,       # 20% for testing, 80% for training
    random_state=42      # Ensures reproducibility
)


In [7]:
y_train.value_counts()

cs_non_english
0    16885
1      248
Name: count, dtype: int64

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# Handle missing values
# Use median for continuous features and mode for binary features
imputer_continuous = SimpleImputer(strategy='median')
imputer_binary = SimpleImputer(strategy='most_frequent')

# Identify binary and continuous columns
binary_cols = [col for col in X_train.columns if X_train[col].nunique() <= 2]  # Binary if unique values <= 2
continuous_cols = [col for col in X_train.columns if col not in binary_cols]

# Apply imputation
X_train_binary = pd.DataFrame(imputer_binary.fit_transform(X_train[binary_cols]), 
                              columns=binary_cols, index=X_train.index)
X_test_binary = pd.DataFrame(imputer_binary.transform(X_test[binary_cols]), 
                             columns=binary_cols, index=X_test.index)

X_train_continuous = pd.DataFrame(imputer_continuous.fit_transform(X_train[continuous_cols]), 
                                  columns=continuous_cols, index=X_train.index)
X_test_continuous = pd.DataFrame(imputer_continuous.transform(X_test[continuous_cols]), 
                                 columns=continuous_cols, index=X_test.index)

# Scale only continuous features
scaler = StandardScaler()
X_train_continuous_scaled = pd.DataFrame(scaler.fit_transform(X_train_continuous), 
                                         columns=continuous_cols, index=X_train.index)
X_test_continuous_scaled = pd.DataFrame(scaler.transform(X_test_continuous), 
                                        columns=continuous_cols, index=X_test.index)

# Combine binary and scaled continuous features
X_train_final = pd.concat([X_train_binary, X_train_continuous_scaled], axis=1)
X_test_final = pd.concat([X_test_binary, X_test_continuous_scaled], axis=1)

# Initialize and train the Logistic Regression model
logreg = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
logreg.fit(X_train_final, y_train)

# Make predictions on the test set
y_pred = logreg.predict(X_test_final)  # Corrected this line

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train_final.columns,
    'coefficient': np.abs(logreg.coef_[0])
})
feature_importance = feature_importance.sort_values(by='coefficient', ascending=False)
print("\nTop 10 Feature Importances:")
print(feature_importance.head(10))


Accuracy: 0.9909

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4226
           1       0.61      0.93      0.73        58

    accuracy                           0.99      4284
   macro avg       0.80      0.96      0.87      4284
weighted avg       0.99      0.99      0.99      4284


Top 10 Feature Importances:
   feature  coefficient
66     spn     2.575982
11     AH0     0.779382
9      AE1     0.772600
23      DH     0.687471
5      sil     0.672834
29     EY1     0.672585
20       B     0.619422
27     ER0     0.599388
52       S     0.563416
34     IH0     0.542355


## Model selection and optimization

In [9]:
## Model Selection and Optimization

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
import numpy as np

# Define models for comparison
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'SVC': SVC(random_state=42, class_weight='balanced')
}

# Define scorer for F1 score of the minority class (cs_non_english=1)
f1_scorer = make_scorer(f1_score, pos_label=1)

# Perform cross-validation and compare models
print("Cross-Validation F1 Scores:")
model_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_final, y_train, cv=5, scoring=f1_scorer, n_jobs=-1)
    mean_score = np.mean(scores)
    std_score = np.std(scores)
    model_scores[name] = mean_score
    print(f"{name}: Mean F1 = {mean_score:.4f} (+/- {std_score:.4f})")

# Select the best model
best_model_name = max(model_scores, key=model_scores.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with F1 Score: {model_scores[best_model_name]:.4f}")

# Hyperparameter optimization for the best model
print("\nPerforming Hyperparameter Optimization for", best_model_name)



Cross-Validation F1 Scores:
LogisticRegression: Mean F1 = 0.5872 (+/- 0.0316)
RandomForest: Mean F1 = 0.3230 (+/- 0.0496)
SVC: Mean F1 = 0.6634 (+/- 0.0153)

Best Model: SVC with F1 Score: 0.6634

Performing Hyperparameter Optimization for SVC
