# Demonstration of data loading and model training with BERT vectors

In [18]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import sklearn.metrics

In [2]:
# Directory holding the original CSV files (relative to this notebook).
ORIGINAL_DATA_DIR = os.path.join(os.pardir, "handout", "data")
# Directory holding the BERT feature files (*.jsonlines) read below.
BERT_FEATURE_DIR = "bert_output_data"

## Format training data

`X` will be a matrix with `N` rows for the `N` texts in the training data, and `M` columns for the `M` features generated by BERT.

`y` will be an array of `N` class labels for training.

In [3]:
# Load the training CSV; its "native_language" column supplies the class labels.
train_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_train.csv")
train_df = pd.read_csv(train_csv_path)

In [4]:
# Display (number of rows, number of columns) of the training CSV as a sanity check.
train_df.shape

(6000, 2)

In [5]:
# Collect one BERT [CLS] vector per line of the training feature file.
# Each line is a JSON record whose "features" list holds one entry per token.
bert_vectors = []
with open(os.path.join(BERT_FEATURE_DIR, "train.jsonlines"), "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        bert_data = json.loads(line)
        for t in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if t["token"] == "[CLS]":
                # We take the first exported layer.
                # NOTE(review): this is the final network layer only if the
                # features were exported with the last layer listed first - confirm.
                bert_vectors.append(t["layers"][0]["values"])
                break
        else:
            # Silently skipping a record would shift every later vector by one
            # row relative to the labels, so fail loudly instead.
            raise ValueError(
                "no [CLS] token found on line " + str(line_number) + " of train.jsonlines"
            )

In [6]:
# There must be exactly one vector per training row; otherwise the rows of the
# feature matrix would not line up with the labels taken from train_df.
assert len(bert_vectors) == len(train_df), (
    "expected one BERT vector per training row, got "
    + str(len(bert_vectors)) + " vectors for " + str(len(train_df)) + " rows"
)
len(bert_vectors)

6000

In [7]:
# Labels: one native-language class per training row.
y = train_df.native_language.values
# Features: one row per training text, one column per BERT feature.
X = np.asarray(bert_vectors)

## Train logistic regression model

In [8]:
# L2-regularised logistic regression on the [CLS] vectors.
# NOTE(review): solver and multi_class are left at the library defaults (the
# recorded output shows 'warn' for both), so the fitted model may differ
# between scikit-learn versions - consider pinning them, and random_state.
lr_model = LogisticRegression(C=1.0, penalty="l2")
lr_model.fit(X, y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Load the test CSV; its "native_language" column holds the true labels.
test_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_test.csv")
test_df = pd.read_csv(test_csv_path)

In [10]:
# Print (number of rows, number of columns) of the test CSV as a sanity check.
print(test_df.shape)

(2000, 2)


In [11]:
# Collect one BERT [CLS] vector per line of the test feature file.
# Each line is a JSON record whose "features" list holds one entry per token.
test_vectors = []
with open(os.path.join(BERT_FEATURE_DIR, "test.jsonlines"), "rt", encoding="utf-8") as infile:
    for line_number, line in enumerate(infile, start=1):
        bert_data = json.loads(line)
        for t in bert_data["features"]:
            # Only extract the [CLS] vector used for classification
            if t["token"] == "[CLS]":
                # We take the first exported layer.
                # NOTE(review): this is the final network layer only if the
                # features were exported with the last layer listed first - confirm.
                test_vectors.append(t["layers"][0]["values"])
                break
        else:
            # Silently skipping a record would shift every later vector by one
            # row relative to the labels, so fail loudly instead.
            raise ValueError(
                "no [CLS] token found on line " + str(line_number) + " of test.jsonlines"
            )

In [12]:
# There must be exactly one vector per test row; otherwise predictions would
# be compared against the wrong true labels.
assert len(test_vectors) == len(test_df), (
    "expected one BERT vector per test row, got "
    + str(len(test_vectors)) + " vectors for " + str(len(test_df)) + " rows"
)
print(len(test_vectors))

2000


In [13]:
# True labels: one native-language class per test row.
y_test = test_df.native_language.values
# Features: one row of BERT [CLS] values per test text.
x_test = np.asarray(test_vectors)

In [26]:
# Predict a native language for every test text.
test_labels = lr_model.predict(x_test)
# Overall accuracy, computed from the predictions above rather than through
# lr_model.score(), which would run the whole prediction a second time.
accuracy = sklearn.metrics.accuracy_score(y_test, test_labels)
print("Accuracy% : " + str(accuracy*100))

Accuracy% : 45.6


In [15]:
# Attach the predictions to the test frame, then flag each row 'Yes'/'No'
# according to whether the prediction matches the true native language.
test_df["predicted_language"] = test_labels
test_df["result"] = np.where(
    test_df["native_language"].eq(test_df["predicted_language"]),
    "Yes",
    "No",
)

In [16]:
# One sub-frame per true native language, used for the per-class statistics.
arabic_df = test_df[test_df["native_language"] == "Arabic"]
cantonese_df = test_df[test_df["native_language"] == "Cantonese"]
japanese_df = test_df[test_df["native_language"] == "Japanese"]
korean_df = test_df[test_df["native_language"] == "Korean"]
mandarin_df = test_df[test_df["native_language"] == "Mandarin"]
polish_df = test_df[test_df["native_language"] == "Polish"]
russian_df = test_df[test_df["native_language"] == "Russian"]
spanish_df = test_df[test_df["native_language"] == "Spanish"]
thai_df = test_df[test_df["native_language"] == "Thai"]
vietnamese_df = test_df[test_df["native_language"] == "Vietnamese"]

In [30]:
# Classes in the order they are reported. Passing this list as `labels=` ties
# each position of the metric arrays to a known class name instead of relying
# on the implicit sorted order of whatever labels happen to be present.
languages = [
    "Arabic", "Cantonese", "Japanese", "Korean", "Mandarin",
    "Polish", "Russian", "Spanish", "Thai", "Vietnamese",
]

precision, recall, fscore, support = sklearn.metrics.precision_recall_fscore_support(
    test_df['native_language'], test_df['predicted_language'], labels=languages)

print("METRICS BY CLASS: \n")
for class_index, language in enumerate(languages):
    # Rows whose true class is `language`; the accuracy on this subset is the
    # share of that class predicted correctly.
    class_df = test_df[test_df['native_language'] == language]

    # Every section after the first is preceded by a blank line.
    print(language + ":" if class_index == 0 else "\n" + language + ":")
    print("Missclassification Rate:", 1-sklearn.metrics.accuracy_score(class_df['native_language'], class_df['predicted_language']))
    print("Precision:", precision[class_index])
    print("Recall:", recall[class_index])
    print("Fscore:", fscore[class_index])

METRICS BY CLASS: 

Arabic:
Missclassification Rate: 0.515
Precision: 0.4801980198019802
Recall: 0.485
Fscore: 0.4825870646766169

Cantonese:
Missclassification Rate: 0.71
Precision: 0.30851063829787234
Recall: 0.29
Fscore: 0.29896907216494845

Japanese:
Missclassification Rate: 0.475
Precision: 0.47297297297297297
Recall: 0.525
Fscore: 0.49763033175355453

Korean:
Missclassification Rate: 0.5800000000000001
Precision: 0.46153846153846156
Recall: 0.42
Fscore: 0.4397905759162304

Mandarin:
Missclassification Rate: 0.685
Precision: 0.3088235294117647
Recall: 0.315
Fscore: 0.31188118811881194

Polish:
Missclassification Rate: 0.52
Precision: 0.4752475247524752
Recall: 0.48
Fscore: 0.47761194029850745

Russian:
Missclassification Rate: 0.43000000000000005
Precision: 0.5089285714285714
Recall: 0.57
Fscore: 0.5377358490566037

Spanish:
Missclassification Rate: 0.5
Precision: 0.5154639175257731
Recall: 0.5
Fscore: 0.5076142131979695

Thai:
Missclassification Rate: 0.41000000000000003
Precisio

In [28]:
# The confusion matrix counts how often each true class (first argument) is
# predicted as each class (second argument), exposing the misclassifications
# between different classes.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true=test_df['native_language'],
    y_pred=test_df['predicted_language'],
)
print("Confusion Matrix: \n", confusion_matrix)

Confusion Matrix: 
 [[ 97   8   8  12  12  17  13  16   5  12]
 [ 10  58  14  15  48  10   8   6  14  17]
 [ 15  12 105  16   9  13   5   7  10   8]
 [  4  17  25  84  16   6  13   7  16  12]
 [ 13  40  14  14  63   7  10  12  10  17]
 [ 11   9   6   6   7  96  33  15   8   9]
 [  8  10  13   6   8  21 114  11   1   8]
 [ 18   7  14   5  12  11  18 100   6   9]
 [ 16  10   9  10   8   4   2  10 118  13]
 [ 10  17  14  14  21  17   8  10  12  77]]


In [29]:
# Turn counts into per-row rates by dividing each row by its own total (the
# number of test examples of that true class) instead of a hard-coded 200.
# Off-diagonal entries are misclassification rates between a pair of classes;
# the diagonal holds the fraction of each class that was classified correctly.
row_totals = confusion_matrix.sum(axis=1, keepdims=True)
# Guard against division by zero for a class with no test examples (its row is
# all zeros, so the rates stay 0).
misclassification_rate = confusion_matrix / np.maximum(row_totals, 1)
print("Misclassifications rate between each pair of classes: \n", misclassification_rate)

Misclassifications rate between each pair of classes: 
 [[0.485 0.04  0.04  0.06  0.06  0.085 0.065 0.08  0.025 0.06 ]
 [0.05  0.29  0.07  0.075 0.24  0.05  0.04  0.03  0.07  0.085]
 [0.075 0.06  0.525 0.08  0.045 0.065 0.025 0.035 0.05  0.04 ]
 [0.02  0.085 0.125 0.42  0.08  0.03  0.065 0.035 0.08  0.06 ]
 [0.065 0.2   0.07  0.07  0.315 0.035 0.05  0.06  0.05  0.085]
 [0.055 0.045 0.03  0.03  0.035 0.48  0.165 0.075 0.04  0.045]
 [0.04  0.05  0.065 0.03  0.04  0.105 0.57  0.055 0.005 0.04 ]
 [0.09  0.035 0.07  0.025 0.06  0.055 0.09  0.5   0.03  0.045]
 [0.08  0.05  0.045 0.05  0.04  0.02  0.01  0.05  0.59  0.065]
 [0.05  0.085 0.07  0.07  0.105 0.085 0.04  0.05  0.06  0.385]]
