# Introduction




### Mount Drive and load dataset

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

In [None]:
# Project folder on the mounted Drive; the save/load cells build their pickle paths from it.
file_path = "drive/MyDrive/LiU/TDDE16/Project"

In [None]:
import pandas as pd

# Read the transcript from the project folder held in `file_path`, so the Drive
# location is spelled out in exactly one place.
df = pd.read_csv(f"{file_path}/The-Office-Lines-V4.csv")
df.head()

In [None]:
# Mean number of characters per line over the whole dataset.
line_lens = list(map(len, df["line"]))
sum(line_lens) / len(line_lens)

In [None]:
# Bar chart of the ten speakers with the most lines.
speaker_counts = df["speaker"].value_counts()
speaker_counts.iloc[:10].plot(kind="bar")

### Imports & Constants

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, cohen_kappa_score
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
import pickle

In [None]:
import random

# Single seed shared by the stochastic steps of the notebook.
RAND_STATE = 42

# Seed both the standard-library and the NumPy global generators.
random.seed(RAND_STATE)
np.random.seed(RAND_STATE)

### Filter relevant columns and rows

In [None]:
# Keep only the text of each line and who said it
df = df.loc[:, ["line", "speaker"]]

# The four speakers with the most lines
top_4_speakers = df["speaker"].value_counts().iloc[:4]
print("The 4 characters with the most lines are:\n", top_4_speakers)
top_4_speakers.plot(kind="bar")

# Drop every line spoken by anyone outside the top four
is_top_speaker = df["speaker"].isin(top_4_speakers.index)
df = df[is_top_speaker]
print("The only speakers after filtering: ", df["speaker"].unique())


##### Remove lines which contain 5 or fewer words.

In [None]:
# (rows, columns) before the short-line filter in the next cell.
print(df.shape)

(28720, 2)


In [None]:
# Keep only lines with more than five whitespace-separated words.
word_counts = df["line"].apply(lambda text: len(text.split()))
df = df[word_counts > 5]


In [None]:
# (rows, columns) after the short-line filter in the previous cell.
print(df.shape)

(16418, 2)


In [None]:
from collections import Counter


def _top_words(lines, limit):
  """Return the `limit` most common lower-cased, whitespace-split tokens in `lines`."""
  tokens = " ".join(lines.str.lower()).split()
  return Counter(tokens).most_common(limit)


# One sub-frame per character
df_michael = df[df["speaker"] == "Michael"]
df_dwight = df[df["speaker"] == "Dwight"]
df_jim = df[df["speaker"] == "Jim"]
df_pam = df[df["speaker"] == "Pam"]

# (word, count) pairs per character; Michael keeps a longer list than the others
michael_counter = _top_words(df_michael["line"], 10000)
dwight_counter = _top_words(df_dwight["line"], 3000)
jim_counter = _top_words(df_jim["line"], 3000)
pam_counter = _top_words(df_pam["line"], 3000)

# Show the count of the token "fax" for each character, if it made the list
for word_counts in (jim_counter, pam_counter, michael_counter, dwight_counter):
  print([item for item in word_counts if item[0] == "fax"])




### Split data into train and test datasets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the lines for testing; the seed is the shared RAND_STATE (42).
x_train, x_test, y_train, y_test = train_test_split(
    df["line"],
    df["speaker"],
    test_size=0.25,
    random_state=RAND_STATE,
)

## New stuff

#### Evaluation function

In [None]:
def evaluate_results(preds, true_vals):
  """Print the classification report and Cohen's kappa of `preds` against `true_vals`.

  Both metrics are written to stdout; nothing is returned.
  """
  report = classification_report(true_vals, preds)
  print(report)
  kappa = cohen_kappa_score(true_vals, preds)
  print("Cohen kappa score = ", kappa)

#### Downsampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=RAND_STATE)

# The sampler expects a 2-D feature matrix, so the lines are passed as a single column.
x_resampled, y_resampled = rus.fit_resample(np.asarray(x_train).reshape(-1, 1), y_train)

# Shuffle the rows, since the downsampling makes them sorted on class.
# - Labels are attached positionally (as an array), so they stay paired with their
#   line whatever index the sampler put on the returned labels.
# - The shuffle is seeded explicitly, so re-running this cell gives the same order
#   instead of depending on the current state of the global NumPy generator.
df_train_downsampled = pd.DataFrame({
    "line": x_resampled.squeeze(),
    "speaker": np.asarray(y_resampled),
}).sample(frac=1, random_state=RAND_STATE)

x_train_downsampled = df_train_downsampled["line"]
y_train_downsampled = df_train_downsampled["speaker"]

In [None]:
# Number of training labels left after downsampling.
y_train_downsampled.shape

(14888,)

### Split training data

Here we split the training data so that we won't overfit when training the meta classifier.

In [None]:
# 80/20 split of the downsampled training data: the base classifiers are fitted on
# the large part, the small part is kept aside for the meta classifier.
x_train_lg, x_train_sm, y_train_lg, y_train_sm = train_test_split(
    x_train_downsampled,
    y_train_downsampled,
    test_size=0.2,
    random_state=RAND_STATE,
)
for part in (x_train_lg, x_train_sm):
  print(part.shape)

(13399,)
(1489,)


### Baselines

#### Random baseline (stratified)

Random classification according to class distribution.

In [None]:
# Baseline that draws predictions at random following the training label distribution.
dummy_clf = DummyClassifier(strategy="stratified", random_state=RAND_STATE)
dummy_clf.fit(x_train, y_train)
preds = dummy_clf.predict(x_test)

evaluate_results(preds, y_test)


#### Baseline 2

This baseline always predicts the most frequent class.

In [None]:
# Derive the majority class from the training labels instead of hard-coding a
# speaker name, so the baseline stays correct if the data or filtering changes.
most_frequent_speaker = y_train.value_counts().idxmax()
preds = [most_frequent_speaker] * len(y_test)

evaluate_results(preds, y_test)

### Numerical Features Classifier

In [None]:
from textblob import TextBlob

def get_sentiment(text):
  """Return TextBlob's `sentiment` value for `text` (callers read its first element as polarity)."""
  blob = TextBlob(text)
  return blob.sentiment



#### Create features

In [None]:


def get_processed_set(x):
  """Build the numerical feature frame for an iterable of text lines.

  Args:
    x: iterable of strings (one dialogue line each).

  Returns:
    DataFrame with one row per line, in input order, and two columns:
    "length" (number of whitespace-separated words) and "polarity"
    (first element of `get_sentiment(line)`).
  """
  # Collect plain Python lists and build the frame once. Growing a Series with
  # `Series.append` copies everything on every iteration (quadratic) and the
  # method no longer exists in pandas >= 2.0.
  line_lengths = []
  line_sentiments = []
  for line in x:
    line_lengths.append(len(line.split()))
    line_sentiments.append(get_sentiment(line)[0])

  return pd.DataFrame(
      {"length": line_lengths, "polarity": line_sentiments},
      columns=["length", "polarity"],
  )


In [None]:
# Length/polarity features for all downsampled training lines ...
x_train_processed = get_processed_set(x_train_downsampled)
# ... for the large training part only ...
x_train_processed_lg = get_processed_set(x_train_lg)
# ... and for the held-out test lines.
x_test_processed = get_processed_set(x_test)

#### Plot data points

In [None]:
# One colour per speaker
colors = {
    'Michael': 'b',
    'Dwight': 'r',
    "Jim": 'purple',
    "Pam": "green",
}

# Colour of each point, in the same order as the feature rows
point_colors = [colors[speaker] for speaker in y_train_downsampled]

plt.scatter(
    x_train_processed["length"],
    x_train_processed["polarity"],
    color=point_colors,
)

#### Fit and tune model

In [None]:
param_grid = {
    "dual": [False],
    # C is the inverse regularisation strength and must be strictly positive;
    # a 0 entry only produces failed fits, so the grid starts at 0.1.
    "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "solver": ["newton-cg", "sag", "saga", "lbfgs" ],
    "n_jobs": [-1],
    "random_state": [RAND_STATE]
}

# Two searches over the same grid: one fitted on all downsampled training data,
# one on the large part only.
gs_len_sen_lg = GridSearchCV(LogisticRegression(), param_grid)
gs_len_sen = GridSearchCV(LogisticRegression(), param_grid)


gs_len_sen.fit(x_train_processed, y_train_downsampled)
gs_len_sen_lg.fit(x_train_processed_lg, y_train_lg)

print("best params: ", gs_len_sen.best_params_)
print("best params: ", gs_len_sen_lg.best_params_)

#### Save trained model

In [None]:
model_filename = "gs_len_sen_10"
model_filename_lg = "gs_len_sen_lg_10"

# Context managers flush and close each file as soon as its pickle is written;
# the bare open() calls left both handles open for the rest of the session.
with open(f'{file_path}/{model_filename}.pkl', 'wb') as output:
  pickle.dump(gs_len_sen, output)
with open(f'{file_path}/{model_filename_lg}.pkl', 'wb') as output_lg:
  pickle.dump(gs_len_sen_lg, output_lg)


#### Load trained model

In [None]:
model_filename = "gs_len_sen_10"
model_filename_lg = "gs_len_sen_lg_10"

# NOTE: pickle.load executes code embedded in the file; only load pickles that
# this notebook wrote itself.
# The files are opened in `with` blocks so the handles are closed after loading.
with open(f"{file_path}/{model_filename}.pkl", 'rb') as model_file:
  gs_len_sen = pickle.load(model_file)
with open(f"{file_path}/{model_filename_lg}.pkl", 'rb') as model_file_lg:
  gs_len_sen_lg = pickle.load(model_file_lg)

#### Evaluate model

In [None]:
preds = gs_len_sen.predict(x_test_processed)
preds2 = gs_len_sen_lg.predict(x_test_processed)

# evaluate_results prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line after each report.
print("preds with all downsampled data:\n")
evaluate_results(preds, y_test)
print("\n\npreds with subset of all downsampled data:\n")
evaluate_results(preds2, y_test)

### TFIDF Classifier

#### Perform Grid Search to tune hyperparameters

In [None]:
param_grid = {
    "clf__dual": [False],
    # C is the inverse regularisation strength and must be strictly positive;
    # a 0 entry only produces failed fits, so the grid starts at 0.1.
    "clf__C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "clf__solver": ["newton-cg", "sag", "saga", "lbfgs" ],
    "clf__n_jobs": [-1],
    "clf__random_state": [RAND_STATE]
}

# TF-IDF features feeding a logistic regression; one pipeline per training set.
tfidf_lg_pipe = Pipeline([("TfidfVectorizer", TfidfVectorizer()),
                          ("clf", LogisticRegression())])


tfidf_lg_pipe_lg = Pipeline([("TfidfVectorizer", TfidfVectorizer()),
                          ("clf", LogisticRegression())])

gs_lg_tfidf = GridSearchCV(tfidf_lg_pipe, param_grid)
gs_lg_tfidf_lg = GridSearchCV(tfidf_lg_pipe_lg, param_grid)

# Fit on all downsampled training data and on the large part respectively.
gs_lg_tfidf.fit(x_train_downsampled, y_train_downsampled)
gs_lg_tfidf_lg.fit(x_train_lg, y_train_lg)

print("Best parameters: ", gs_lg_tfidf.best_params_)
print("Best parameters: ", gs_lg_tfidf_lg.best_params_)

#### Save trained model

In [None]:
model_filename = "gs_lg_tfidf_10.pkl"
model_filename_lg = "gs_lg_tfidf_lg_10.pkl"

# Context managers flush and close each file as soon as its pickle is written;
# the bare open() calls left both handles open for the rest of the session.
with open(f'{file_path}/{model_filename}', 'wb') as output:
  pickle.dump(gs_lg_tfidf, output)
with open(f'{file_path}/{model_filename_lg}', 'wb') as output_lg:
  pickle.dump(gs_lg_tfidf_lg, output_lg)

#### Load trained model

In [None]:
model_filename = "gs_lg_tfidf_10.pkl"
model_filename_lg = "gs_lg_tfidf_lg_10.pkl"

# NOTE: pickle.load executes code embedded in the file; only load pickles that
# this notebook wrote itself.
# The files are opened in `with` blocks so the handles are closed after loading.
with open(f"{file_path}/{model_filename}", 'rb') as model_file:
  gs_lg_tfidf = pickle.load(model_file)
with open(f"{file_path}/{model_filename_lg}", 'rb') as model_file_lg:
  gs_lg_tfidf_lg = pickle.load(model_file_lg)

#### Evaluate classifier

In [None]:
def _report_tfidf(title, fitted_search, predictions):
  """Print the metrics and plot a labelled confusion matrix for one TF-IDF model.

  Args:
    title: heading printed before the report.
    fitted_search: fitted grid search; its `classes_` label the matrix axes.
    predictions: predicted speakers for `x_test`, compared against `y_test`.

  Returns:
    The ConfusionMatrixDisplay that was plotted.
  """
  print(title)
  print("Classification report:\n", classification_report(y_test, predictions))
  print("Cohen Kappa score: ", cohen_kappa_score(y_test, predictions))
  class_names = fitted_search.classes_
  # Fix the row/column order to the model's class order and show the names on
  # the axes, so the plot can be read without a separate index mapping.
  matrix = confusion_matrix(y_test, predictions, labels=class_names)
  print("\n\n Confusion Matrix:\n")
  display = ConfusionMatrixDisplay(matrix, display_labels=class_names)
  display.plot()
  return display


preds = gs_lg_tfidf.predict(x_test)
preds2 = gs_lg_tfidf_lg.predict(x_test)

disp = _report_tfidf("Using all downsampled data:\n", gs_lg_tfidf, preds)
disp = _report_tfidf("Using a subset of all downsampled data:\n", gs_lg_tfidf_lg, preds2)

print("Classes to index mapping: ", gs_lg_tfidf.classes_)

In [None]:
# Positions in the test set where the TF-IDF prediction differs from the true speaker.
error_indexes_tfidf = [
    position
    for position, (predicted, actual) in enumerate(zip(preds, y_test))
    if predicted != actual
]

### BERT classifier

In [None]:
# Install simpletransformers into the runtime.
# NOTE(review): the install is unpinned, so a re-run may pick up a different release — consider pinning the version printed below.
!pip install simpletransformers
# Show which simpletransformers version ended up installed.
!pip freeze | grep simpletransformers

In [None]:
from simpletransformers.classification import ClassificationModel
import os

random.seed(RAND_STATE)
np.random.seed(RAND_STATE)

# Fine-tuning settings shared by both BERT models.
train_args = {
    "reprocess_input_data": True,
    "fp16": False,
    "num_train_epochs": 4,
    "overwrite_output_dir": True,
    "learning_rate": 4e-5,
    "manual_seed": RAND_STATE,
    "use_multiprocessing": False,
    "use_multiprocessing_for_evaluation": False,
}

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Two independent models over the four speaker classes: `bert_clf_lg` is later
# trained on the large training part, `bert_clf` on all downsampled training data.
model_type, model_name = "bert", "bert-base-uncased"
bert_clf_lg = ClassificationModel(model_type, model_name, num_labels=4, args=train_args)
bert_clf = ClassificationModel(model_type, model_name, num_labels=4, args=train_args)

In [None]:
# Integer class id for each speaker, as used for the BERT models.
labels = {"Michael": 0, "Dwight": 1, "Jim": 2, "Pam": 3}


In [None]:
labels = {
    "Michael": 0,
    "Dwight": 1,
    "Jim": 2,
    "Pam": 3
}

# Map speaker strings to integers.
# Built in one pass instead of growing an array with np.append (which copies the
# whole array on every iteration); dtype=float keeps the same float64 labels the
# np.append loop produced.
speakers = np.array([labels[speaker] for speaker in y_train_downsampled], dtype=float)
speakers_lg = np.array([labels[speaker] for speaker in y_train_lg], dtype=float)

# The line Series supplies the index; the encoded labels are attached positionally.
df_train = pd.DataFrame({"line": x_train_downsampled, "speaker": speakers})
df_train_lg = pd.DataFrame({"line": x_train_lg, "speaker": speakers_lg})


#### Hyperparameter tuning (not used due to time constraints)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold


def _micro_f1(true_labels, predicted_labels):
  """Micro-averaged F1, handed to `eval_model` as an extra metric."""
  return f1_score(true_labels, predicted_labels, average='micro')


# Only the first 20 training rows are used here.
tmp_train = df_train[:20]

# 5-fold is default
kf = KFold()

learning_rates = [5e-5, 4e-5, 3e-5, 2e-5]

# One eval-result dict per (learning rate, fold), appended in loop order; the
# averaging cell that follows indexes into this list by that order.
results = []

for learning_rate in learning_rates:
  # Kept in its own name so the notebook-level `train_args` is not overwritten.
  cv_train_args = {
    "reprocess_input_data": True,
    "fp16": False,
    "num_train_epochs": 4,
    "overwrite_output_dir": True,
    "learning_rate": learning_rate
  }
  for i, (train_index, test_index) in enumerate(kf.split(tmp_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")
    cv_train_set = tmp_train.iloc[train_index]
    cv_val_set = tmp_train.iloc[test_index]
    # Start from the pretrained weights in every fold. Reusing one model across
    # folds would keep training it, so later folds would be validated on rows
    # the model had already been trained on.
    clf = ClassificationModel("bert", "bert-base-uncased", num_labels=4, args=cv_train_args)
    clf.train_model(cv_train_set)
    result, model_outputs, wrong_predictions = clf.eval_model(cv_val_set, f1=_micro_f1, acc=accuracy_score)
    results.append(result)




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Fold 0:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[0 1 2 3]


  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 1:
  Train: index=[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[4 5 6 7]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 2:
  Train: index=[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19]
  Test:  index=[ 8  9 10 11]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 3:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19]
  Test:  index=[12 13 14 15]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 4:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
  Test:  index=[16 17 18 19]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Fold 0:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[0 1 2 3]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 1:
  Train: index=[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[4 5 6 7]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 2:
  Train: index=[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19]
  Test:  index=[ 8  9 10 11]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 3:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19]
  Test:  index=[12 13 14 15]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 4:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
  Test:  index=[16 17 18 19]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Fold 0:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[0 1 2 3]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 1:
  Train: index=[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[4 5 6 7]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 2:
  Train: index=[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19]
  Test:  index=[ 8  9 10 11]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 3:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19]
  Test:  index=[12 13 14 15]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 4:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
  Test:  index=[16 17 18 19]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Fold 0:
  Train: index=[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[0 1 2 3]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 1:
  Train: index=[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19]
  Test:  index=[4 5 6 7]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 2:
  Train: index=[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19]
  Test:  index=[ 8  9 10 11]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 3:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19]
  Test:  index=[12 13 14 15]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

Fold 4:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
  Test:  index=[16 17 18 19]




  0%|          | 0/16 [00:00<?, ?it/s]

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Running Epoch 0 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 2 of 4:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 3 of 4:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:

# Mean eval loss per learning rate: `results` holds 4 consecutive groups of 5 fold results.
avgs = []
for lr_idx in range(4):
  fold_losses = [results[5 * lr_idx + fold_idx]['eval_loss'] for fold_idx in range(5)]
  avgs.append(sum(fold_losses) / len(fold_losses))
print(avgs)

[0.793445348739624, 0.7600388109683991, 0.8500930786132812, 1.0260414958000184]


#### Train bert model on all training data.

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Re-seed the global generators immediately before fine-tuning.
random.seed(RAND_STATE)
np.random.seed(seed=RAND_STATE)

# Fine-tune BERT on all downsampled training data (df_train).
bert_clf.train_model(df_train)

#### Save model

In [None]:
model_filename = "bert_10.pkl"
# The context manager flushes and closes the file once the pickle is written;
# the bare open() call left the handle open for the rest of the session.
with open(f'{file_path}/{model_filename}', 'wb') as output:
  pickle.dump(bert_clf, output)

#### Load model trained on all training data

In [None]:
model_filename = "bert_10.pkl"

# NOTE: pickle.load executes code embedded in the file; only load pickles that
# this notebook wrote itself.
# The file is opened in a `with` block so the handle is closed after loading.
with open(f"{file_path}/{model_filename}", 'rb') as model_file:
  bert_clf = pickle.load(model_file)

#### Train bert model on subset of training data.

In [None]:
# Re-seed the global generators immediately before fine-tuning.
random.seed(RAND_STATE)
np.random.seed(seed=RAND_STATE)

# Fine-tune the second BERT model on the large training part only (df_train_lg).
bert_clf_lg.train_model(df_train_lg)

#### Save model

In [None]:
model_filename_lg = "bert_lg_10.pkl"
# The context manager flushes and closes the file once the pickle is written;
# the bare open() call left the handle open for the rest of the session.
with open(f'{file_path}/{model_filename_lg}', 'wb') as output_lg:
  pickle.dump(bert_clf_lg, output_lg)

#### Load model trained on subset of training data.

In [None]:
model_filename_lg = "bert_lg_10.pkl"

# NOTE: pickle.load executes code embedded in the file; only load pickles that
# this notebook wrote itself.
# The file is opened in a `with` block so the handle is closed after loading.
with open(f"{file_path}/{model_filename_lg}", 'rb') as model_file_lg:
  bert_clf_lg = pickle.load(model_file_lg)

In [None]:
# Reset the standard-library and NumPy global generators to the shared seed.
random.seed(RAND_STATE)
np.random.seed(seed=RAND_STATE)


In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` against `labels`."""
    micro_f1 = f1_score(labels, preds, average='micro')
    return micro_f1

In [None]:
# Encode the test speakers with the same mapping as the training data.
# Built in one pass instead of growing an array with np.append (which copies the
# whole array on every iteration); dtype=float keeps the same float64 labels the
# np.append loop produced.
speakers = np.array([labels[speaker] for speaker in y_test], dtype=float)

# The line Series supplies the index; the encoded labels are attached positionally.
df_test = pd.DataFrame({"line": x_test, "speaker": speakers})

# Evaluate the BERT model that was trained on the large training part.
result, model_outputs, wrong_predictions = bert_clf_lg.eval_model(df_test, f1=f1_multiclass, acc=accuracy_score)






Running Evaluation:   0%|          | 0/898 [00:00<?, ?it/s]

In [None]:
# Predicted class id per test line: position of the largest raw model output.
preds = [row.argmax() for row in model_outputs]

In [None]:
from sklearn.metrics import classification_report, cohen_kappa_score

print(f"Result: {result}")

# Encoded true labels, materialised once for both metrics
true_labels = list(df_test["speaker"])
print("Evaluation report:\n", classification_report(true_labels, preds))
print("\nCohen Kappa score = ", cohen_kappa_score(true_labels, preds))

In [None]:
random.seed(RAND_STATE)
np.random.seed(RAND_STATE)

# Test positions where the BERT prediction differs from the encoded true label
bert_truth = list(df_test["speaker"])
error_indexes_bert = [
    position
    for position, (predicted, actual) in enumerate(zip(preds, bert_truth))
    if predicted != actual
]

# Positions that both BERT and the TF-IDF model got wrong
shared_errors = set(error_indexes_bert).intersection(set(error_indexes_tfidf))

print("intersect bert and tfidf error indexes: ", len(shared_errors))
print("total error len tfidif: ", len(error_indexes_tfidf))
print("total error len BERT: ", len(error_indexes_bert))
print("---------------- 20 random missclassified lines ----------------")
indexes = list(shared_errors)
random.shuffle(indexes)
missclassified_lines = [(x_test.iloc[i], y_test.iloc[i]) for i in indexes[:20]]

print(missclassified_lines)




### Ensemble classifier

In [None]:
from scipy.special import softmax

class EnsambleClassifier:
  """
  Stacking ensemble: a logistic-regression meta classifier on top of three
  pre-trained base classifiers.

  The class probabilities that the numerical-feature classifier, the TF-IDF
  classifier and the BERT classifier assign to a line are concatenated into a
  12-column feature row, and the meta classifier (tuned with a grid search)
  makes the final prediction from those rows.
  """

  # Feature columns: D/J/M/P = Dwight/Jim/Michael/Pam; suffix 1 = TF-IDF
  # classifier, 2 = numerical-feature classifier, 3 = BERT.
  _PROBA_COLUMNS = ["D1", "J1", "M1", "P1",
                    "D2", "J2", "M2", "P2",
                    "D3", "J3", "M3", "P3"]

  # BERT output positions follow the notebook's label mapping
  # (Michael=0, Dwight=1, Jim=2, Pam=3); this picks them in D, J, M, P order.
  _BERT_COLUMN_ORDER = [1, 2, 0, 3]

  def __init__(self, num_clf, tfidf_clf, bert_clf, meta_clf):
    """
    Args:
      num_clf: fitted classifier over the length/polarity features
        (must offer `predict_proba`).
      tfidf_clf: fitted classifier over raw text (must offer `predict_proba`).
      bert_clf: fitted model whose `predict(list_of_texts)` returns
        `(predictions, raw_outputs)`.
      meta_clf: unfitted estimator that is wrapped in a grid search.
    """
    self.num_clf = num_clf
    self.tfidf_clf = tfidf_clf
    self.bert_clf = bert_clf

    param_grid = {
      "dual": [False],
      # C must be strictly positive for logistic regression; a 0 entry only
      # produces failed fits, so the grid starts at 0.1.
      "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
      "solver": ["newton-cg", "sag", "saga", "lbfgs" ],
      "n_jobs": [-1],
      "random_state": [RAND_STATE]
    }

    self.gs_meta_clf = GridSearchCV(meta_clf, param_grid=param_grid)

  def set_base_classifiers(self, classifiers):
    """Replace the base classifiers; `classifiers` is `[num_clf, tfidf_clf, bert_clf]`."""
    self.num_clf = classifiers[0]
    self.tfidf_clf = classifiers[1]
    self.bert_clf = classifiers[2]

  def train(self, x_train, y_train):
    """Tune and fit the meta classifier on the base-classifier probabilities of `x_train`."""
    df_x = self.__get_base_class_probs(x_train)

    self.gs_meta_clf.fit(df_x, y_train)
    print("Best meta classifier params: ", self.gs_meta_clf.best_params_)
    print("Meta classifier hyperparameter tuning and training done.")

  def predict(self, x):
    """Return the meta classifier's predictions for the lines in `x` (also kept in `self.preds`)."""
    df_x = self.__get_base_class_probs(x)

    # Get final predictions from meta classifier
    self.preds = self.gs_meta_clf.predict(df_x)
    return self.preds

  def __get_base_class_probs(self, x):
    """Run all three base classifiers on `x` and return their probabilities as one frame."""
    # Create special features
    x_processed = get_processed_set(x)

    # Get probabilities from all base classifiers
    probs_num = self.num_clf.predict_proba(x_processed)
    probs_tfidf = self.tfidf_clf.predict_proba(x)
    # Only the raw outputs are needed; the hard predictions are discarded.
    _, raw_output = self.bert_clf.predict(list(x))
    # Row-wise softmax turns the raw outputs into probabilities.
    probs_bert = softmax(np.asarray(raw_output), axis=1)

    # Create new x matrix with all probabilities as features
    return self.__get_df_with_proba_features(probs_num, probs_tfidf, probs_bert)

  def __get_df_with_proba_features(self, probs_num, probs_tfidf, probs_bert):
    """
    Combine the three (n_rows, 4) probability blocks into one 12-column frame.

    The TF-IDF and numerical blocks are taken in the order given (columns
    D, J, M, P); the BERT block is re-ordered from its M, D, J, P layout.
    """
    # reshape(-1, 4) also gives an empty input a well-formed (0, 4) shape.
    probs_tfidf = np.asarray(probs_tfidf, dtype=float).reshape(-1, 4)
    probs_num = np.asarray(probs_num, dtype=float).reshape(-1, 4)
    probs_bert = np.asarray(probs_bert, dtype=float).reshape(-1, 4)

    # Assemble the whole matrix at once; appending one row at a time copied the
    # frame on every iteration and relied on DataFrame.append, which pandas 2.0 removed.
    features = np.hstack([
        probs_tfidf,
        probs_num,
        probs_bert[:, self._BERT_COLUMN_ORDER],
    ])
    return pd.DataFrame(features, columns=self._PROBA_COLUMNS)


In [None]:
random.seed(RAND_STATE)
np.random.seed(seed=RAND_STATE)

# Build the ensemble from the base models that were fitted on the large training part.
clf = EnsambleClassifier(gs_len_sen_lg, gs_lg_tfidf_lg, bert_clf_lg, LogisticRegression())

# Fit the meta classifier on the small part that the base models were not fitted on.
clf.train(x_train_sm, y_train_sm)


In [None]:
# Swap in the base models that were fitted on all downsampled training data.
clf.set_base_classifiers([gs_len_sen, gs_lg_tfidf, bert_clf])

In [None]:
# Ensemble predictions for the held-out test lines.
preds = clf.predict(x_test)

  line_lengths = pd.Series()
  line_sentiments = pd.Series()


  0%|          | 0/898 [00:00<?, ?it/s]

### Results

In [None]:
print("preds:\n", preds)

# Ensemble metrics on the test set
report = classification_report(y_test, preds)
print("Classification report:\n", report)
kappa = cohen_kappa_score(y_test, preds)
print("\nCohen Kappa score = ", kappa)

#### Error analysis on results

In [None]:
from collections import Counter

random.seed(RAND_STATE)
np.random.seed(seed=RAND_STATE)

# Number of example lines printed per group; matches the "20" in the headings.
N_EXAMPLES = 20


def _mean(values):
  """Arithmetic mean of `values`, or NaN when the list is empty."""
  return sum(values) / len(values) if values else float("nan")


# Test positions the ensemble got wrong / right
error_indexes = [index  for index, pred_tuple in enumerate(zip(preds, y_test)) if pred_tuple[0] != pred_tuple[1]]
correct_indexes = [index  for index, pred_tuple in enumerate(zip(preds, y_test)) if pred_tuple[0] == pred_tuple[1]]

random.shuffle(error_indexes)
print("---------------- 20 random missclassified lines ----------------")
# The slice previously ran to 21 and so printed one line more than announced.
missclassified_lines = [(x_test.iloc[i], y_test.iloc[i], preds[i]) for i in error_indexes[:N_EXAMPLES]]
print(missclassified_lines)

print("---------------- 20 random correctly classified lines ----------")
# Shuffle before slicing so these really are a random sample rather than the
# first correctly classified lines of the test set.
random.shuffle(correct_indexes)
correct_lines = [(x_test.iloc[i], y_test.iloc[i]) for i in correct_indexes[:N_EXAMPLES]]
print(correct_lines)

# Character lengths of the lines in each group
line_lens_missclassified = [len(x_test.iloc[index]) for index in error_indexes]
avg_line_missclassified = _mean(line_lens_missclassified)

line_lens_correct = [len(x_test.iloc[index]) for index in correct_indexes]
avg_line_correct = _mean(line_lens_correct)

print("Average line length of missclassified: ", avg_line_missclassified)
print("Average line length of correctly classified: ", avg_line_correct)
line_lens_missclassified.sort(reverse=True)
line_lens_correct.sort(reverse=True)

print(line_lens_missclassified)
print(line_lens_correct)
