In [4]:
import re
import warnings
import stanfordnlp

import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

from laserembeddings import Laser

Also don't forget download pre-trained models 

$ python -m laserembeddings download-models

In [2]:
dataframes = pd.read_json(open('data/train_dataset.json', 'rb'))

train_x = dataframes['tweet_text'] 
train_y = dataframes['info']

# Report features

In [5]:
warnings.simplefilter("ignore", UserWarning)
features = []

nlp = stanfordnlp.Pipeline(lang='en')
for doc in train_x:
    numbers = np.log(int(len(re.findall('[0-9]+', doc))) + 1)
    urls = np.log(int(len(re.findall('((www\.[\s]+)|(https?://[^\s]+))', doc))) + 1)
    user = np.log(int(len(re.findall('@[A-Za-z0-9]+', doc))) + 1)
            
    nums = 0; nouns = 0; verbs = 0; adverbs = 0; adjectives = 0
    nsubj = 0; nmod = 0; nummod = 0; advmod = 0; roots = 0; compounds = 0
            
    doc = nlp(doc)
    for i, sent in enumerate(doc.sentences):
        for word in sent.words:
            if word.pos == 'NUM':
                nums += 1
            elif word.pos == 'NOUN':
                nouns += 1
            elif word.pos == 'VERB':
                verbs += 1
            elif word.pos == 'ADV':
                adverbs += 1
            elif word.pos == 'ADJ':
                adjectives += 1
                            
            if word.dependency_relation == "nsubj":
                nsubj += 1
            elif word.dependency_relation == "nmod":
                nmod += 1
            elif word.dependency_relation == "nummod":
                nummod += 1
            elif word.dependency_relation == "advmod":
                advmod += 1
            elif word.dependency_relation == "root":
                roots += 1
            elif word.dependency_relation == "compound":
                compounds += 1

    nums = np.log((nums) + 1)
    nouns = np.log((nouns) + 1)
    verbs = np.log((verbs) + 1)
    adverbs = np.log((adverbs) + 1)
    adjectives = np.log((adjectives) + 1)
            
    nsubj = np.log((nsubj) + 1)
    nmod = np.log((nmod) + 1)
    nummod = np.log((nummod) + 1)
    advmod = np.log((advmod) + 1)
    roots = np.log((roots) + 1)
    compounds = np.log((compounds) + 1)
            
    features.append([numbers, urls, user, nums, nouns, verbs, adverbs, adjectives,
                            nsubj, nmod, nummod, advmod, roots, compounds])

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 

In [6]:
from sklearn import svm

clf = svm.SVC(gamma='auto', kernel='poly')
scores = cross_val_score(clf, features, train_y, scoring='roc_auc', cv=10)
scores.mean()

0.4921414857571455

In [57]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)
scores = cross_val_score(clf, all_features, train_y, scoring='roc_auc', cv=10)
scores.mean()

0.4958658419836944

In [58]:
from sklearn.linear_model import SGDClassifier

text_clf = SGDClassifier(loss='hinge',alpha=1e-3, random_state=42, max_iter=100, tol=None)
scores = cross_val_score(clf, all_features, train_y, scoring='roc_auc', cv=10)
scores.mean()

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

text_clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=7)
scores = cross_val_score(clf, all_features, train_y, scoring='roc_auc', cv=10)
scores.mean()

In [None]:
from sklearn.svm import LinearSVC

text_clf = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, 
                     loss='hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, 
                     tol=0.0001, verbose=0)
scores = cross_val_score(clf, all_features, train_y, scoring='roc_auc', cv=10)
scores.mean()

In [None]:
from sklearn.neural_network import MLPClassifier

text_clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10, 5), random_state=1)
scores = cross_val_score(clf, all_features, train_y, scoring='roc_auc', cv=20)
scores.mean()

Results of cross-validation (20):

info — 

gov — 

media — 

eyewitness — 

# CNN and LSTM

In [26]:
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
---
Loading: depparse
With settings: 
{'model_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt_parser.pt', 'pretrain_path': '/Users/fv/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 

In [27]:
import warnings
warnings.simplefilter("ignore", UserWarning)

import numpy as np
from nltk.probability import FreqDist

import re

corpus = [] # all words in collection
processed_content = [] # preprocessed messages
string_content = []

print('Corpus creation...')
for row in train_x:
    row = str(row)
    row = re.sub('((www\.[\s]+)|(https?://[^\s]+))', 'URL', row) # URL
    row = re.sub('\_', ' ', row) # _
    row = re.sub('\!', ' ATTENTION', row) # !
    row = re.sub('\?', ' QUESTION', row) # ?
    row = re.sub('@[A-Za-z0-9]+', ' ', row) # mentions
    row = re.sub('\W', ' ', row) # symbols
    row = re.sub('\_', ' ', row) # _
    row = re.sub('[\s]+', ' ', row) # spaces
    new_row = []
    doc = nlp(row.lower())
    string = ''
    for token in doc:
        new_row.append(token.lemma_)
        corpus.append(token.lemma_)
        string += str(token.lemma_) + ' '
    processed_content.append(new_row)
    string_content.append(string)
print("Finish")

Corpus creation...
Finish


In [28]:
fdist = FreqDist(corpus)

print('Stopwords filtering')
x = 1
fdist_dict = dict()
for word, frequency in fdist.most_common():
    if fdist[word] < len(processed_content):
        fdist_dict[word] = x
        x += 1

digit_content = []
for message in processed_content:
    message_meet = []
    for word in message:
        if word in fdist_dict:
            message_meet.append(fdist_dict[word])
    digit_content.append(message_meet)

content = np.array(digit_content)
content

Stopwords filtering


array([list([1, 3150, 75, 457, 6220, 3, 2111, 13, 7, 526, 338, 719, 74, 140, 87, 5, 3151, 12, 917, 3, 144, 435]),
       list([1, 304, 144, 103, 7469, 475, 143, 140, 18, 1388, 36, 209, 13, 1518, 863, 1586, 446, 51, 2]),
       list([1, 144, 103, 209, 5, 720, 13, 84, 132, 158, 5, 30, 523, 297, 209, 13, 7470, 1732, 4, 140, 2]),
       ...,
       list([1, 3, 925, 671, 113, 5361, 82, 1065, 2083, 555, 181, 39, 75, 556, 63, 8, 7, 50, 3, 925, 69, 696, 780]),
       list([1, 4330, 1287, 494, 6, 11, 52, 148, 43, 2358, 15, 2703, 5900, 11, 82, 214, 12514, 115, 32312, 5015]),
       list([1, 173, 3688, 1902, 19, 32313, 3706, 36, 7, 1557, 1, 139, 708, 367, 19, 5128, 6495, 4, 1906, 929, 187, 14173])],
      dtype=object)

In [29]:
# Evaluations
import tensorflow as tf
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def auc(y_true, y_pred):
    auc = tf.metrics.AUC(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

In [None]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding


# Others
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(string_content)

embeddings_index = dict()
f = open('data/glove.twitter.27B.200d.txt') #en — glove.twitter.27B.200d.txt, ru — cc.ru.300.vec, es — glove-sbwc.i25.vec, de — cc.de.300.vec
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocabulary_size, 200))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [None]:
#CNN + embeddings
from sklearn.model_selection import train_test_split
from statistics import mean

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters
seed = 7
max_features = 5000
maxlen = 50
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 15


acc_list = []
score_list = []
auc_list = []
precision_list = []
recall_list =[]
x = 0

while x < 10:
    x += 1
    x_train, x_test, y_train, y_test = train_test_split(content, y_train, test_size=0.2, random_state=seed)

    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    model = Sequential()
    model.add(Embedding(vocabulary_size, 200, input_length=50, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.1))

    # we add a Convolution1D, which will learn filters
    # word group filters of size filter_length:
    model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
    # we use max pooling:
    model.add(GlobalMaxPooling1D())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy', precision, recall])

    model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

    score, acc, precision_score, recall_score = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
    
    acc_list.append(acc)
    score_list.append(score)
    precision_list.append(precision_score)
    recall_list.append(recall_score)

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(precision_list))

# CLASSIC

In [30]:
# Preprocessing modules
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn import metrics
from statistics import mean

In [31]:
# Models
from sklearn.ensemble import RandomForestClassifier

acc_list = []
precision_list = []
recall_list =[]
auc_list = []
x = 1

while x < 11:
    print ('Validation' , x)
    x += 1
    #Preprocessing
    X_train, X_test, Y_train, Y_test = train_test_split(string_content, train_y, test_size = 0.2, random_state = 0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_train_counts.shape

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_train_tf.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_train_tfidf.shape

    #RandomForest
    text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=20, max_features="auto", random_state=42))])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, predicted)
    auc_list.append(metrics.auc(fpr, tpr))

print ('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(precision_list))
print('Test auc:', mean(auc_list))

Validation 1
Validation 2
Validation 3
Validation 4
Validation 5
Validation 6
Validation 7
Validation 8
Validation 9
Validation 10
Validation finished
Test accuracy: 0.5203150170037587
Test precision: 0.5052169137836353
Test recall: 0.5052169137836353
Test auc: 0.5144431891774833


In [19]:
#Stochastic Gradient Descent

from sklearn.linear_model import SGDClassifier

acc_list = []
precision_list = []
recall_list =[]
auc_list = []
x = 1

while x < 11:
    print ('Validation' , x)
    x += 1
    #Preprocessing
    X_train, X_test, Y_train, Y_test = train_test_split(string_content, train_y, test_size = 0.2, random_state = 0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_train_counts.shape

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_train_tf.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_train_tfidf.shape
    
    text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge',alpha=1e-3, random_state=42, max_iter=100, tol=None))])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)

    
    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, predicted)
    auc_list.append(metrics.auc(fpr, tpr))

print ('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(precision_list))
print('Test auc:', mean(auc_list))

Validation 1
Validation 2
Validation 3
Validation 4
Validation 5
Validation 6
Validation 7
Validation 8
Validation 9
Validation 10
Validation finished
Test accuracy: 0.675
Test precision: 0.8775510204081632
Test recall: 0.8775510204081632
Test auc: 0.5546900708360033


In [20]:
#ExtraTrees
from sklearn.ensemble import ExtraTreesClassifier

acc_list = []
precision_list = []
recall_list =[]
auc_list = []
x = 1

while x < 11:
    print ('Validation' , x)
    x += 1
    #Preprocessing
    X_train, X_test, Y_train, Y_test = train_test_split(string_content, train_y, test_size = 0.2, random_state = 0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_train_counts.shape

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_train_tf.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_train_tfidf.shape

    text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0))])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)
    
    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, predicted)
    auc_list.append(metrics.auc(fpr, tpr))

print ('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(precision_list))
print('Test auc:', mean(auc_list))

Validation 1
Validation 2
Validation 3
Validation 4
Validation 5
Validation 6
Validation 7
Validation 8
Validation 9
Validation 10
Validation finished
Test accuracy: 0.656
Test precision: 0.5548780487804879
Test recall: 0.5548780487804879
Test auc: 0.5684805763868442


In [21]:
#LinearSVM
from sklearn.svm import LinearSVC

acc_list = []
precision_list = []
recall_list =[]
auc_list = []
x = 1

while x < 11:
    print ('Validation' , x)
    x += 1
    #Preprocessing
    X_train, X_test, Y_train, Y_test = train_test_split(string_content, train_y, test_size = 0.2, random_state = 0)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train)
    X_train_counts.shape

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    X_train_tf.shape

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_train_tfidf.shape

    text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=42, tol=0.0001, verbose=0))])
    text_clf.fit(X_train, Y_train)
    predicted = text_clf.predict(X_test)

    acc_list.append(accuracy_score(Y_test, predicted))
    precision_list.append(precision_score(Y_test, predicted, average='binary'))
    recall_list.append(recall_score(Y_test, predicted, average='binary'))
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, predicted)
    auc_list.append(metrics.auc(fpr, tpr))

print ('Validation finished')

print('Test accuracy:', mean(acc_list))
print('Test precision:', mean(precision_list))
print('Test recall:', mean(precision_list))
print('Test auc:', mean(auc_list))

Validation 1
Validation 2
Validation 3
Validation 4
Validation 5
Validation 6
Validation 7
Validation 8
Validation 9
Validation 10
Validation finished
Test accuracy: 0.652
Test precision: 0.5514705882352942
Test recall: 0.5514705882352942
Test auc: 0.5557855175877656
