In [42]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [43]:
# Load the data
data = pd.read_csv('./data_w_features/data_w_all_features_final_win10.csv')
data = data.dropna()

In [44]:
def ensemble_model(data, target_column='tone', drop_columns=['time_0.5', 'language', 'tone', 'participant', 'script'], train_size=0.8):
    
    
    # # Define features and target
    # X = data.drop(drop_columns, axis=1)
    # y = data[target_column]

    # # Split dataset into train and test sets
    # X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, stratify=y, random_state=1)

    # Define features and target
    X_train = data[data['participant'] != 'subject3'].drop(drop_columns, axis=1)
    y_train = data[data['participant'] != 'subject3'][target_column]
    X_test = data[data['participant'] == 'subject3'].drop(drop_columns, axis=1)
    y_test = data[data['participant'] == 'subject3'][target_column]    

    # Standardize features
    sc = StandardScaler()
    X_train_std = sc.fit_transform(X_train)
    X_test_std = sc.transform(X_test)

    # Define classifiers
    clf1 = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=1)
    clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)
    clf3 = SVC(kernel='rbf', random_state=1, gamma=0.10, C=1.0)

    # Combine classifiers into ensemble model
    eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2), ('svc', clf3)], voting='hard')

    # Fit ensemble model
    eclf.fit(X_train_std, y_train)

    # Predict
    y_pred = eclf.predict(X_test_std)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", conf_matrix)

    # Print recall, precision, F1 score, accuracy
    print("Results on language:")
    print(classification_report(y_test, y_pred))

    return eclf, y_test, y_pred

In [45]:
eclf1, y1_test, y1_pred = ensemble_model(data, target_column='language')
eclf2, y2_test, y2_pred = ensemble_model(data, target_column='tone')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion matrix:
 [[  9 511]
 [  9 923]]
Results on language:
              precision    recall  f1-score   support

          ch       0.50      0.02      0.03       520
          en       0.64      0.99      0.78       932

    accuracy                           0.64      1452
   macro avg       0.57      0.50      0.41      1452
weighted avg       0.59      0.64      0.51      1452



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion matrix:
 [[399 430]
 [396 227]]
Results on language:
              precision    recall  f1-score   support

         bus       0.50      0.48      0.49       829
      casual       0.35      0.36      0.35       623

    accuracy                           0.43      1452
   macro avg       0.42      0.42      0.42      1452
weighted avg       0.43      0.43      0.43      1452



In [46]:
print(len(y1_test), len(y1_pred))
print(len(y2_test), len(y2_pred))

1452 1452
1452 1452


In [47]:
# Merge the two predictions
merged_label = []
merged_pred = []

for i in range(len(y1_test)):
    if y1_test.iloc[i] == 'ch' and y2_test.iloc[i] == 'bus':
        merged_label.append(1)
    elif y1_test.iloc[i] == 'ch' and y2_test.iloc[i] == 'casual':
        merged_label.append(2)
    elif y1_test.iloc[i] == 'en' and y2_test.iloc[i] == 'bus':
        merged_label.append(3)
    elif y1_test.iloc[i] == 'en' and y2_test.iloc[i] == 'casual':
        merged_label.append(4)
    else:
        merged_label.append(0)

    if y1_pred[i] == 'ch' and y2_pred[i] == 'bus':
        merged_pred.append(1)
    elif y1_pred[i] == 'ch' and y2_pred[i] == 'casual':
        merged_pred.append(2)
    elif y1_pred[i] == 'en' and y2_pred[i] == 'bus':
        merged_pred.append(3)
    elif y1_pred[i] == 'en' and y2_pred[i] == 'casual':
        merged_pred.append(4)
    else:
        merged_pred.append(0)

print("label 1 = ch & bus")
print("label 2 = ch & casual")
print("label 3 = en & bus")
print("label 4 = en & casual")

label 1 = ch & bus
label 2 = ch & casual
label 3 = en & bus
label 4 = en & casual


In [48]:
print(len(merged_label), len(merged_pred))
num_zeros = merged_pred.count(0)
print(num_zeros)

1452 1452
0


In [49]:
# Confusion matrix
conf_matrix = confusion_matrix(merged_label, merged_pred)
print("Confusion matrix (merged):\n", conf_matrix)

# Print recall, precision, F1 score, accuracy
print("Results on merged:")
print(classification_report(merged_label, merged_pred))

Confusion matrix (merged):
 [[  6   3 143 134]
 [  0   0 103 131]
 [  5   3 245 290]
 [  1   0 292  96]]
Results on merged:
              precision    recall  f1-score   support

           1       0.50      0.02      0.04       286
           2       0.00      0.00      0.00       234
           3       0.31      0.45      0.37       543
           4       0.15      0.25      0.18       389

    accuracy                           0.24      1452
   macro avg       0.24      0.18      0.15      1452
weighted avg       0.26      0.24      0.20      1452

