**1. Import libraries**

In [None]:
# Mount Google Drive at /content/drive; the GloVe files opened further down
# are read from paths under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import svm
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

**2. Load data**

In [None]:
url = "https://raw.githubusercontent.com/taegyoon-kim/Dissertation-Essay-1/master/training_sep21.csv"

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped
# and a warning is emitted for each one.
df = pd.read_csv(url, on_bad_lines='warn')

# Keep only the document text and the binary label, under short names.
df['text'] = df['status_final_text']
df['threat'] = df['final_binary'].astype(float)
df = df[['text','threat']]

# Shuffle the rows once. The seed makes the row order (and therefore every
# positional K-fold split below) identical across kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

**3. Evaluation metrics**

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

def report_results(A, B):
    """Score predicted labels against reference labels.

    Args:
        A: Predicted labels.
        B: Reference (true) labels, paired with ``A``.

    Returns:
        ``[accuracy, precision, recall, f1]``, computed over the pairs in
        which neither value is missing.
    """
    # Pair the two inputs and drop any pair with a missing value.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    predicted, actual = paired['A'], paired['B']

    # The sklearn metrics take (y_true, y_pred), hence `actual` first.
    return [
        accuracy_score(actual, predicted),
        precision_score(actual, predicted),
        recall_score(actual, predicted),
        f1_score(actual, predicted),
    ]

# Metric name -> scorer for the `cross_validate(..., scoring=scoring)` calls
# further down. Each key appears in the results dict as 'test_<key>'.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score),
}

**3. Logistic regression + Count Vector & TF-IDF Vector & Word Embeddings**

In [None]:
###############################################################


# Binary presence/absence features over unigrams and bigrams.
count = CountVectorizer(ngram_range=(1, 2), binary=True, lowercase=True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
    lr.fit(train_X, train_y)
    pred_y = lr.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nLR + count')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


# TF-IDF weighted features over unigrams and bigrams.
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
    lr.fit(train_X, train_y)
    pred_y = lr.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nLR + tfidf')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass


def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

# Embed every document in `df`. `data_sets` is only created in a later cell,
# so looping over it here raises NameError on a fresh kernel.
glove_X = np.array([sent2vec(x) for x in tqdm(df["text"])])
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the GloVe matrix itself, not on whatever matrix the previous
# experiment left behind in `train_val_X`.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  train_y = y[train_index]
  val_y = y[val_index]

  lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
  lr.fit(train_X, train_y)
  pred_y = lr.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  lr_performance = report_results(pred_y, val_y)
  cv_acc.append(lr_performance[0])
  cv_pre.append(lr_performance[1])
  cv_rec.append(lr_performance[2])
  cv_f1.append(lr_performance[3])

print('\nLR + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))

In [None]:
############################ RF ############################


# Binary presence/absence features over unigrams and bigrams.
count = CountVectorizer(ngram_range=(1, 2), binary=True, lowercase=True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    rf = RandomForestClassifier(n_estimators=500, random_state=0)
    rf.fit(train_X, train_y)
    pred_y = rf.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nRF + count')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


# TF-IDF weighted features over unigrams and bigrams.
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    rf = RandomForestClassifier(n_estimators=500, random_state=0)
    rf.fit(train_X, train_y)
    pred_y = rf.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nRF + tfidf')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass


def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

# Embed every document in `df`. `data_sets` is only created in a later cell,
# so looping over it here raises NameError on a fresh kernel.
glove_X = np.array([sent2vec(x) for x in tqdm(df["text"])])
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the GloVe matrix itself, not on whatever matrix the previous
# experiment left behind in `train_val_X`.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  train_y = y[train_index]
  val_y = y[val_index]

  rf = RandomForestClassifier(n_estimators=500, random_state=0)
  rf.fit(train_X, train_y)
  pred_y = rf.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  rf_performance = report_results(pred_y, val_y)
  cv_acc.append(rf_performance[0])
  cv_pre.append(rf_performance[1])
  cv_rec.append(rf_performance[2])
  cv_f1.append(rf_performance[3])

# The embeddings loaded above come from glove.6B.200d.txt, so the label says
# 200d (it previously read 100d).
print('\nRF + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))

In [None]:
###############################################################XGB


# Binary presence/absence features over unigrams and bigrams.
count = CountVectorizer(ngram_range=(1, 2), binary=True, lowercase=True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    xgb = XGBClassifier()
    xgb.fit(train_X, train_y)
    pred_y = xgb.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nXGB + count')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


# TF-IDF weighted features over unigrams and bigrams.
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# One [accuracy, precision, recall, f1] row per fold.
fold_scores = []
for train_index, val_index in kf.split(train_val_X):
    train_X, val_X = train_val_X[train_index, :], train_val_X[val_index, :]
    train_y, val_y = train_val_y[train_index], train_val_y[val_index]

    xgb = XGBClassifier()
    xgb.fit(train_X, train_y)
    pred_y = xgb.predict(val_X)

    fold_scores.append(report_results(pred_y, val_y))

# Transpose the per-fold rows into one list per metric.
cv_acc, cv_pre, cv_rec, cv_f1 = (list(column) for column in zip(*fold_scores))

print('\nXGB + tfidf')
for label, scores in (('- acc:', cv_acc), ('- pre:', cv_pre), ('- rec:', cv_rec), ('- f1:', cv_f1)):
    print(label, round(np.mean(scores) * 100, 2))


###############################################################


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass


def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

# Embed every document in `df`. `data_sets` is only created in a later cell,
# so looping over it here raises NameError on a fresh kernel.
glove_X = np.array([sent2vec(x) for x in tqdm(df["text"])])
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the GloVe matrix itself, not on whatever matrix the previous
# experiment left behind in `train_val_X`.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  train_y = y[train_index]
  val_y = y[val_index]

  xgb = XGBClassifier()
  xgb.fit(train_X, train_y)
  pred_y = xgb.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  xgb_performance = report_results(pred_y, val_y)
  cv_acc.append(xgb_performance[0])
  cv_pre.append(xgb_performance[1])
  cv_rec.append(xgb_performance[2])
  cv_f1.append(xgb_performance[3])

print('\nXGB + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))

In [None]:
# count

# Define what this cell needs instead of relying on state from other cells:
# `kfold` and `data_sets` are otherwise only created further down, and `lr`
# would be whatever model the last manual K-fold loop left behind.
kfold = model_selection.KFold(n_splits=5, random_state=42, shuffle = True)
lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
data_sets = [df]

for data in data_sets:
    Count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
    Count_X = Count.fit_transform(data["text"])
    y = data['threat']
    lr_results = model_selection.cross_validate(estimator=lr,
                                                X=Count_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for logistic regression + count']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = lr_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  

# TF-IDF

for data in data_sets:
    Tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
    tfidf_X = Tfidf.fit_transform(data["text"])
    y = data['threat']
    lr_results = model_selection.cross_validate(estimator=lr,
                                                X=tfidf_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for logistic regression + TF-IDF']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = lr_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)


# GloVe

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass


def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

for data in data_sets:
    # One sentence vector per document, stacked into a 2-D feature matrix.
    glove_X = np.array([sent2vec(x) for x in tqdm(data["text"])])
    y = data['threat']
    lr_results = model_selection.cross_validate(estimator=lr,
                                                X=glove_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for logistic regression + GloVe']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = lr_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)



**4. Random Forest + Count Vector & TF-IDF Vector & Word Embeddings**

In [None]:
# Shared by every experiment in this cell.
kfold = model_selection.KFold(n_splits=5, random_state=42, shuffle = True)
rf = RandomForestClassifier(n_estimators=500, random_state=0)
data_sets = [df]

# count
for data in data_sets:
    Count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
    Count_X = Count.fit_transform(data["text"])
    y = data['threat']
    rf_results = model_selection.cross_validate(estimator=rf,
                                                X=Count_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for random forest + count']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = rf_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  

# TF-IDF
for data in data_sets:
    Tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
    tfidf_X = Tfidf.fit_transform(data["text"])
    y = data['threat']
    rf_results = model_selection.cross_validate(estimator=rf,
                                                X=tfidf_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for random forest + TF-IDF']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = rf_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  

# GloVe

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')


# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass

def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

# NOTE(review): this evaluates `lr` (logistic regression) inside the Random
# Forest section; it looks like a leftover copy of the LR + GloVe experiment.
# Confirm whether it is still wanted here.
for data in data_sets:
    # One sentence vector per document, stacked into a 2-D feature matrix.
    glove_X = np.array([sent2vec(x) for x in tqdm(data["text"])])
    y = data['threat']
    lr_results = model_selection.cross_validate(estimator=lr,
                                                X=glove_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for logistic regression + GloVe']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = lr_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)


# GloVe

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')


# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/glove_twitter/glove.twitter.27B.100d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass


def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 100 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(100)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(100)
    return total / norm

for data in data_sets:
    # One sentence vector per document, stacked into a 2-D feature matrix.
    glove_X = np.array([sent2vec(x) for x in tqdm(data["text"])])
    y = data['threat']
    rf_results = model_selection.cross_validate(estimator=rf,
                                                X=glove_X,
                                                y=y,
                                                cv=kfold,
                                                scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for random forest + GloVe']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = rf_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)

***5. XGBoost + Count Vector & TF-IDF Vector & Word Embeddings***


In [None]:


# Shared by every experiment in this cell.
kfold = model_selection.KFold(n_splits=5, random_state=42, shuffle = True)
xgb = XGBClassifier()
data_sets = [df]

# count
for data in data_sets:
    Count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
    Count_X = Count.fit_transform(data["text"])
    y = data['threat']
    # Stored as `xgb_results` so the random-forest results in `rf_results`
    # are not overwritten by XGBoost scores.
    xgb_results = model_selection.cross_validate(estimator=xgb,
                                                 X=Count_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports for XGBoost + count (K=5)']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = xgb_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  
# TF-IDF
for data in data_sets:
    Tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
    Tfidf_X = Tfidf.fit_transform(data["text"])
    y = data['threat']
    # Stored as `xgb_results` so the random-forest results in `rf_results`
    # are not overwritten by XGBoost scores.
    xgb_results = model_selection.cross_validate(estimator=xgb,
                                                 X=Tfidf_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports for XGBoost + TF-IDF (K=5)']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = xgb_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  

# GloVe

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')


# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/diss_detection/python_scripts/glove/glove.6B.100d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass

def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 100 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(100)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(100)
    return total / norm

for data in data_sets:
    # One sentence vector per document, stacked into a 2-D feature matrix.
    glove_X = np.array([sent2vec(x) for x in tqdm(data["text"])])
    y = data['threat']
    xgb_results = model_selection.cross_validate(estimator=xgb,
                                                 X=glove_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for XGBoost + GloVe']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = xgb_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)

***6. SVM + Count Vector & TF-IDF Vector & Word Embeddings***

In [None]:
from sklearn import svm
kfold = model_selection.KFold(n_splits=5, random_state=3, shuffle = True)
# The estimator reuses the name `svm` and so shadows the module; the import
# on the first line re-binds the module each time this cell is run.
svm = svm.SVC(kernel='linear', C = 1)
data_sets = [df]
for data in data_sets:
    Count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
    Count_X = Count.fit_transform(data["text"])
    y = data['threat']
    svm_results = model_selection.cross_validate(estimator=svm,
                                                 X=Count_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports for support vector machine + count (K=5)']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = svm_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)
  
for data in data_sets:
    Tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
    Tfidf_X = Tfidf.fit_transform(data["text"])
    y = data['threat']
    svm_results = model_selection.cross_validate(estimator=svm,
                                                 X=Tfidf_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports for support vector machine + TF-IDF(K=5)']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = svm_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)

# GloVe

from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')


# Parse the GloVe text file into {token: float32 vector}.
embeddings_index = {}
# `with` closes the handle even if parsing is interrupted part-way through.
with open('/content/drive/My Drive/glove_twitter/glove.twitter.27B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            continue  # blank line: nothing to index
        word = values[0]
        try:
            embeddings_index[word] = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # A field after the token is not numeric; skip this line rather
            # than abort the whole load.
            pass

def sent2vec(s):
    """Embed a text as the L2-normalised sum of its word vectors.

    The text is lower-cased and tokenised with ``word_tokenize``. Stop words
    and non-alphabetic tokens are dropped, and the vectors of the remaining
    tokens found in the module-level ``embeddings_index`` are summed and
    scaled to unit length.

    Args:
        s: Text to embed; coerced with ``str()``.

    Returns:
        1-D numpy array. A zero vector of length 200 is returned when no
        token has an embedding or when the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` around the lookup,
    # so unrelated errors are no longer swallowed.
    vectors = [
        embeddings_index[tok]
        for tok in tokens
        if tok not in stop_words and tok.isalpha() and tok in embeddings_index
    ]
    if not vectors:
        return np.zeros(200)
    total = np.sum(vectors, axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Avoid 0/0 -> NaN features, which the estimators cannot handle.
        return np.zeros(200)
    return total / norm

for data in data_sets:
    # One sentence vector per document, stacked into a 2-D feature matrix.
    glove_X = np.array([sent2vec(x) for x in tqdm(data["text"])])
    y = data['threat']
    svm_results = model_selection.cross_validate(estimator=svm,
                                                 X=glove_X,
                                                 y=y,
                                                 cv=kfold,
                                                 scoring=scoring)
    # Mean and standard deviation are both reported in percent, so the two
    # numbers on a line share one unit.
    report = ['cross-validation reports (K=5) for SVM + GloVe']
    for label, key in (('accuracy:', 'test_accuracy'), ('precision:', 'test_precision'),
                       ('recall:', 'test_recall'), ('F-1:', 'test_f1_score')):
        scores = svm_results[key]
        report += ['\n', label, round(100 * scores.mean(), 2), '/ Std:', round(100 * np.std(scores), 2)]
    print(*report)