Author notes (Taegyoon Kim, taegyoon@psu.edu)

---


- This is a notebook for classifiers for violent political rhetoric (https://osf.io/5ckw4/). 
- The input data is tweet text that contains one or more of the violent keywords extracted using the violent keyword extractor (https://github.com/taegyoon-kim/violent_political_rheotric_on_twitter/blob/master/violent_political_rhetoric_violent_keyword_extract.py). 
- The training data is available upon request via email. 
- The notebook will be fully available upon publication of the paper.



Drive mount


---



In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive in the Colab runtime.
drive.mount("/content/drive")

Packages


---



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import model_selection

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import cross_val_score

Load data


---



In [None]:
url = '' # training set: path or URL of the labelled CSV (intentionally left blank here)

# Read the labelled tweets and keep only the two columns used downstream,
# under shorter names: the tweet text and the binary label cast to float.
df = pd.read_csv(url)
df['text'] = df['status_final_text']
df['threat'] = df['final_binary'].astype(float)
df = df[['text','threat']]

# Shuffle the rows once. The fixed seed makes the row order -- and therefore
# the composition of every cross-validation fold computed later -- repeatable
# across runs. reset_index(drop=True) then restores a clean 0..n-1 index.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

Evaluation metrics


---



In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

def report_results(A, B):
    """Score a set of binary predictions against reference labels.

    Parameters
    ----------
    A : array-like
        Predicted labels (passed to the metrics as ``y_pred``).
    B : array-like
        Reference labels (passed to the metrics as ``y_true``).

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` in that order.

    Rows where either value is missing are dropped before scoring.
    """
    # Pair the two inputs column-wise and discard incomplete pairs.
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    predicted, actual = paired['A'], paired['B']

    # Order of the scorers fixes the order of the returned list.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return [scorer(actual, predicted) for scorer in scorers]

Logistic regression + Count Vector & TF-IDF Vector & Word Embeddings


---



In [None]:
##### count vector
# Logistic regression on a binary unigram+bigram bag-of-words, scored with 5-fold CV.


# you can change arguments here depending on how you want the text to be pre-processed/represented as a matrix
# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary also covers validation tweets -- fit inside the loop if a
# strictly leakage-free estimate is needed.
count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1) # you set the number of folds for cross-validation

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1) # this is where you can change hyper-parameters for logistic regression
  lr.fit(train_X, train_y)
  pred_y = lr.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  lr_performance = report_results(pred_y, val_y)
  cv_acc.append(lr_performance[0])
  cv_pre.append(lr_performance[1])
  cv_rec.append(lr_performance[2])
  cv_f1.append(lr_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nLR + count')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### TFIDF vector
# Logistic regression on unigram+bigram TF-IDF features, scored with 5-fold CV.


# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary and IDF weights also reflect validation tweets -- fit
# inside the loop if a strictly leakage-free estimate is needed.
tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
  lr.fit(train_X, train_y)
  pred_y = lr.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  lr_performance = report_results(pred_y, val_y)
  cv_acc.append(lr_performance[0])
  cv_pre.append(lr_performance[1])
  cv_rec.append(lr_performance[2])
  cv_f1.append(lr_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nLR + tfidf')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### glove
# Tokenizer / stop-word resources and the pre-trained GloVe vectors.


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# token -> float32 embedding vector
embeddings_index = {}
# The context manager closes the file even if parsing raises part-way through.
with open('glove.6B.200d.txt', encoding="utf8") as f:  # this is where glove embedding file is stored
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except ValueError:
            # Best effort: skip lines whose remaining fields are not all numeric.
            pass


def sent2vec(s):
    """Turn a piece of text into one unit-length 200-dimensional vector.

    The text is lower-cased and tokenized; stop words and non-alphabetic
    tokens are dropped; the embeddings of the remaining tokens found in
    ``embeddings_index`` are summed and the sum is L2-normalized.

    Parameters
    ----------
    s : object
        Input text; converted with ``str()`` before processing.

    Returns
    -------
    numpy.ndarray
        The normalized sum of the token vectors, or ``np.zeros(200)`` when no
        token has an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors are not silently swallowed.
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if not vectors:
        # 200 must match the dimensionality of the loaded embedding file.
        return np.zeros(200)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would produce NaNs that break the classifiers.
        return np.zeros(200)
    return v / norm


# One sentence vector per tweet, stacked into an (n_tweets, dim) array.
glove_X = [sent2vec(x) for x in tqdm(df["text"])]
glove_X = np.array(glove_X)
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the matrix actually used in this section rather than on a matrix
# left over from an earlier section.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = y.iloc[train_index]
  val_y = y.iloc[val_index]

  lr = LogisticRegression(C=1, random_state=7, solver='sag', max_iter=2000, n_jobs=-1)
  lr.fit(train_X, train_y)
  pred_y = lr.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  lr_performance = report_results(pred_y, val_y)
  cv_acc.append(lr_performance[0])
  cv_pre.append(lr_performance[1])
  cv_rec.append(lr_performance[2])
  cv_f1.append(lr_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nLR + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))

Random Forest + Count Vector & TF-IDF Vector & Word Embeddings


---



In [None]:
##### count vector
# Random forest on a binary unigram+bigram bag-of-words, scored with 5-fold CV.

# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary also covers validation tweets -- fit inside the loop if a
# strictly leakage-free estimate is needed.
count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  rf = RandomForestClassifier(n_estimators=500, random_state=0) # this is where you can change hyper-parameters for random forest
  rf.fit(train_X, train_y)
  pred_y = rf.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  rf_performance = report_results(pred_y, val_y)
  cv_acc.append(rf_performance[0])
  cv_pre.append(rf_performance[1])
  cv_rec.append(rf_performance[2])
  cv_f1.append(rf_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nRF + count')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### TFIDF vector
# Random forest on unigram+bigram TF-IDF features, scored with 5-fold CV.


# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary and IDF weights also reflect validation tweets -- fit
# inside the loop if a strictly leakage-free estimate is needed.
tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  rf = RandomForestClassifier(n_estimators=500, random_state=0)
  rf.fit(train_X, train_y)
  pred_y = rf.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  rf_performance = report_results(pred_y, val_y)
  cv_acc.append(rf_performance[0])
  cv_pre.append(rf_performance[1])
  cv_rec.append(rf_performance[2])
  cv_f1.append(rf_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nRF + tfidf')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### glove
# Tokenizer / stop-word resources and the pre-trained GloVe vectors.


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# token -> float32 embedding vector
embeddings_index = {}
# The context manager closes the file even if parsing raises part-way through.
with open('glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except ValueError:
            # Best effort: skip lines whose remaining fields are not all numeric.
            pass


def sent2vec(s):
    """Turn a piece of text into one unit-length 200-dimensional vector.

    The text is lower-cased and tokenized; stop words and non-alphabetic
    tokens are dropped; the embeddings of the remaining tokens found in
    ``embeddings_index`` are summed and the sum is L2-normalized.

    Parameters
    ----------
    s : object
        Input text; converted with ``str()`` before processing.

    Returns
    -------
    numpy.ndarray
        The normalized sum of the token vectors, or ``np.zeros(200)`` when no
        token has an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors are not silently swallowed.
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if not vectors:
        # 200 must match the dimensionality of the loaded embedding file.
        return np.zeros(200)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would produce NaNs that break the classifiers.
        return np.zeros(200)
    return v / norm

# One sentence vector per tweet, stacked into an (n_tweets, dim) array.
glove_X = [sent2vec(x) for x in tqdm(df["text"])]
glove_X = np.array(glove_X)
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the matrix actually used in this section rather than on a matrix
# left over from an earlier section.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = y.iloc[train_index]
  val_y = y.iloc[val_index]

  rf = RandomForestClassifier(n_estimators=500, random_state=0)
  rf.fit(train_X, train_y)
  pred_y = rf.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  rf_performance = report_results(pred_y, val_y)
  cv_acc.append(rf_performance[0])
  cv_pre.append(rf_performance[1])
  cv_rec.append(rf_performance[2])
  cv_f1.append(rf_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
# The label says 200d because the vectors come from glove.6B.200d.txt.
print('\nRF + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))

XGBoost + Count Vector & TF-IDF Vector & Word Embeddings


---



In [None]:
##### count vector
# XGBoost on a binary unigram+bigram bag-of-words, scored with 5-fold CV.


# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary also covers validation tweets -- fit inside the loop if a
# strictly leakage-free estimate is needed.
count = CountVectorizer(ngram_range = (1,2), binary = True, lowercase = True)
train_val_X = count.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  xgb = XGBClassifier() # library-default hyper-parameters
  xgb.fit(train_X, train_y)
  pred_y = xgb.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  xgb_performance = report_results(pred_y, val_y)
  cv_acc.append(xgb_performance[0])
  cv_pre.append(xgb_performance[1])
  cv_rec.append(xgb_performance[2])
  cv_f1.append(xgb_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nXGB + count')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### TFIDF vector
# XGBoost on unigram+bigram TF-IDF features, scored with 5-fold CV.


# NOTE(review): the vectorizer is fit on every row before the folds are made,
# so the vocabulary and IDF weights also reflect validation tweets -- fit
# inside the loop if a strictly leakage-free estimate is needed.
tfidf = TfidfVectorizer(ngram_range = (1,2), lowercase = True)
train_val_X = tfidf.fit_transform(df['text'])
train_val_y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

for train_index, val_index in kf.split(train_val_X):

  train_X = train_val_X[train_index,:]
  val_X = train_val_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = train_val_y.iloc[train_index]
  val_y = train_val_y.iloc[val_index]

  xgb = XGBClassifier() # this is where you can change hyper-parameters for XGBoost
  xgb.fit(train_X, train_y)
  pred_y = xgb.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  xgb_performance = report_results(pred_y, val_y)
  cv_acc.append(xgb_performance[0])
  cv_pre.append(xgb_performance[1])
  cv_rec.append(xgb_performance[2])
  cv_f1.append(xgb_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nXGB + tfidf')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))


##### glove
# Tokenizer / stop-word resources and the pre-trained GloVe vectors.


from tqdm import tqdm
import nltk
from nltk import word_tokenize
from nltk import punkt
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# token -> float32 embedding vector
embeddings_index = {}
# The context manager closes the file even if parsing raises part-way through.
with open('glove.6B.200d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Blank line: there is no token to index.
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        except ValueError:
            # Best effort: skip lines whose remaining fields are not all numeric.
            pass


def sent2vec(s):
    """Turn a piece of text into one unit-length 200-dimensional vector.

    The text is lower-cased and tokenized; stop words and non-alphabetic
    tokens are dropped; the embeddings of the remaining tokens found in
    ``embeddings_index`` are summed and the sum is L2-normalized.

    Parameters
    ----------
    s : object
        Input text; converted with ``str()`` before processing.

    Returns
    -------
    numpy.ndarray
        The normalized sum of the token vectors, or ``np.zeros(200)`` when no
        token has an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    tokens = [w for w in tokens if w not in stop_words and w.isalpha()]

    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors are not silently swallowed.
    vectors = [embeddings_index[w] for w in tokens if w in embeddings_index]
    if not vectors:
        # 200 must match the dimensionality of the loaded embedding file.
        return np.zeros(200)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Dividing by a zero norm would produce NaNs that break the classifiers.
        return np.zeros(200)
    return v / norm


# One sentence vector per tweet, stacked into an (n_tweets, dim) array.
glove_X = [sent2vec(x) for x in tqdm(df["text"])]
glove_X = np.array(glove_X)
y = df['threat']

from sklearn.model_selection import KFold
kf = KFold(n_splits = 5, shuffle = True, random_state= 1)

# Per-fold validation scores; their means are printed after the loop.
cv_acc = []
cv_pre= []
cv_rec = []
cv_f1 = []

# Split on the matrix actually used in this section rather than on a matrix
# left over from an earlier section.
for train_index, val_index in kf.split(glove_X):
  train_X = glove_X[train_index,:]
  val_X = glove_X[val_index,:]

  # KFold yields integer positions, so labels are taken with .iloc. Plain []
  # on a Series is a label lookup and only lines up with the matrix rows
  # while df carries a 0..n-1 index.
  train_y = y.iloc[train_index]
  val_y = y.iloc[val_index]

  xgb = XGBClassifier() # library-default hyper-parameters
  xgb.fit(train_X, train_y)
  pred_y = xgb.predict(val_X)

  # report_results returns [accuracy, precision, recall, f1].
  xgb_performance = report_results(pred_y, val_y)
  cv_acc.append(xgb_performance[0])
  cv_pre.append(xgb_performance[1])
  cv_rec.append(xgb_performance[2])
  cv_f1.append(xgb_performance[3])

# Mean of each metric over the folds, as a percentage with two decimals.
print('\nXGB + glove_200d')
print('- acc:', round(np.mean(cv_acc)*100,2))
print('- pre:', round(np.mean(cv_pre)*100,2))
print('- rec:', round(np.mean(cv_rec)*100,2))
print('- f1:', round(np.mean(cv_f1)*100,2))