In [1]:
# LiveIt Platform Ensemble Classifier Script #

# import statements
import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
import nltk
import os
import pandas as pd
from numpy import arange
import csv
import re
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn import svm

In [2]:
# function for creating more features
def createDataset(comments):

# dataset = [['comment_id', 'comment_length', 'punct_score', 'chars_per_word', 'class']]

    dataset = []

    for num in range(len(comments)):
        word_count = 0
        char_count = 0
        punct_list = [0, 0, 0]
    
        for word in comments.iloc[num].split(' '):
        
            word_count += 1
        
            for char in word:
            
                char_count += 1
            
                if char == '.':
                    punct_list[0] += 1
                elif char == '!':
                    punct_list[1] += 1
                elif char == '?':
                    punct_list[2] += 1
        chars_per_word = char_count / word_count
    
        # A high punct_score means that there are a lot of periods, not many question
        # marks or exclaimations.
        punct_score = punct_list[0] / (punct_list[1] + punct_list[2] + 0.1)
        dataset.append([num, len(comments.iloc[num]), punct_score, chars_per_word])
    
    return pd.DataFrame(dataset)

In [3]:
# function for predicting on newly input comments
def predict(model, lr_clf, svc, poly_svc, post):

    post_list = [post]
    post_series = pd.Series(post_list)
    
    tokenized = post_series.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
            
    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
    
    attention_mask = np.where(padded != 0, 1, 0)
    
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)
    
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)
        
    post_berted = last_hidden_states[0][:,0,:].numpy()
    
    post_df = pd.DataFrame(post_list)
    
    post_features = createDataset(post_series)
    
    bert_pred = lr_clf.predict(post_berted)
    
    svc_pred = svc.predict(post_features)
    
    poly_pred = poly_svc.predict(post_features)
    
    score = 0
    
    if bert_pred == 1:
        score += 2.1
    if svc_pred == 1:
        score += 1
    if poly_pred == 1:
        score += 1
    if score > 1.9:
        return 'Exemplary Mentorship!'
    else:
        return 'Average Mentorship'

In [4]:
# read in and create the dataset
good = pd.read_csv('/storage/home/tum224/exemplary_comments.csv')
bad_messages = pd.read_csv('/storage/home/tum224/average_comments.csv')
good = good.rename(columns={'comment_body': 'text', 'Unnamed: 0': 'class'})
bad_messages = bad_messages.rename(columns={'comment_body': 'text', 'Unnamed: 0': 'class'})

# take a random sample of the massive dataset for resource purposes -- INCREASE WITH MORE RESOURCES
good = good.sample(n=400, random_state=1)
bad_messages = bad_messages.sample(n=400, random_state=1)

In [5]:
# create a dataframe of messages
# create the text field of the dataframe
good = list(good.text)
bad_messages = list(bad_messages.text)
good_messages = good * 5

# create the class field of the dataframe
# label 1 for good mentorship, 0 for bad
labels = []

for i in range(len(good_messages)):
    labels.append(1)

for i in range(len(bad_messages)):
    labels.append(0)

# create the dataframe
data = pd.DataFrame()
data['text'] = good_messages + bad_messages
data['class'] = labels
X = data['text']
y = data['class']

In [6]:
# establish the svm models
# manipulate the dataframe's data to create a dataset fit for svm
X_data = createDataset(X)
X = data['text']

# set the svm models
C = 1 # SVM regularization parameter
svc = svm.SVC(kernel='linear', C=C)
poly_svc = svm.SVC(kernel='poly', degree=3, C=C)

In [7]:
# establish the BERT model
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

# apply the tokenizer to generate BERT features -- adapted from state-of-the-art BERT model
tokenized = data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)


with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

features = last_hidden_states[0][:,0,:].numpy()

# fit the model now with BERT features
lr_clf = LogisticRegression(max_iter=500)

In [8]:
# output the results of cross validation on the models
print('5-fold cross validation:\n')

labels = ['SVC', 'Poly SVC']
for svc, label in zip([svc, poly_svc], labels):
    scores = model_selection.cross_val_score(svc, X_data, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.3f [%s]" % (scores.mean(), label))

score = model_selection.cross_val_score(lr_clf, features, y, cv = 5, scoring = 'accuracy')
print("Accuracy: %0.3f [%s]" % (score.mean(), 'BERT'))

5-fold cross validation:

Accuracy: 0.901 [SVC]
Accuracy: 0.910 [Poly SVC]
Accuracy: 0.908 [BERT]


In [9]:
# since done validation results, make actual predictions with svm
# create the train-test data for the svms
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['class'])
X_train_data = createDataset(X_train)
X_test_data = createDataset(X_test)
X_data = createDataset(X)

# create and fit the models
svc = svm.SVC(kernel='linear', C=C).fit(X_train_data, list(y_train))
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X_train_data, y_train)

# make predictions based on the models
svc_y_pred = svc.predict(X_test_data)
poly_y_pred = poly_svc.predict(X_test_data)

# reset the dataframe for BERT
data = pd.DataFrame()
data['text'] = X_train
data['class'] = y_train

In [10]:
# implement the actual BERT model
# change the training set
model = model_class.from_pretrained(pretrained_weights)
tokenized = data['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
features = last_hidden_states[0][:,0,:].numpy()

# fit the BERT model
lr_clf = LogisticRegression(max_iter=500)
lr_clf.fit(features, y_train)

# change the testing set
tokenized = X_test.apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
X_test_features = last_hidden_states[0][:,0,:].numpy()

# make the BERT predctions
bert_y_pred = lr_clf.predict(X_test_features)

In [11]:
# use the predctions of past three models to build voting ensemble
ensemble_predictions = []

for num in range(len(bert_y_pred)):
    
    votes = 0
    
    if bert_y_pred[num] == 1:
        votes += 2.1
    if svc_y_pred[num] == 1:
        votes += 1
    if poly_y_pred[num] == 1:
        votes += 1
    if votes > 1:
        ensemble_predictions.append(1)
    else:
        ensemble_predictions.append(0)


#print('Accuracy: ', accuracy_score(ensemble_predictions, y_test))
#print('F1: ', f1_score(ensemble_predictions, y_test))
#print('Precision: ', precision_score(ensemble_predictions, y_test))
#print('Recall: ', recall_score(ensemble_predictions, y_test))

In [12]:
# print out the return measures
print(confusion_matrix(ensemble_predictions, y_test))
print('Accuracy: ', accuracy_score(ensemble_predictions, y_test))
print('F1: ', f1_score(ensemble_predictions, y_test))
print('Precision: ', precision_score(ensemble_predictions, y_test))
print('Recall: ', recall_score(ensemble_predictions, y_test))

NameError: name 'confusion_matrix' is not defined