In [306]:
import pandas as pd
import nltk
import os
import re
import numpy as np
from nltk import word_tokenize, ngrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
import gensim
import joblib

HOME_DIR = "/home_remote"

### Data preparation

In [257]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)

In [260]:
#join tiltle and text
positives['text'] = positives['Title'] + positives['Text']
negatives['text'] = negatives['Title'] + negatives['Text']
#join all text of the same user
pos = positives.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
neg = negatives.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
#Labelling the data
pos['Label'] = 1
neg['Label'] = 0

In [261]:
#concatenate the data
data = pd.concat([pos, neg], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)

data

Unnamed: 0,TrainSubjectId,text,Label
0,train_subject9956,But killing him themselves will give them ...,0
1,train_subject8693,Can...can i put my dick in it? You're...,0
2,train_subject7819,"Psychologists Shielded U.S. Torture Program, ...",0
3,train_subject5510,wtf Hull are in the premiership I sno...,0
4,train_subject489,"Is it you, keyboard cat? What a time ...",0
...,...,...,...
481,train_subject7049,That line cracked me the fuck up. I just ...,0
482,train_subject6802,Yeah i use this: http://imgur.com/TOvn6ij ...,0
483,train_subject3902,My first ever needle felting creation! :D (Bu...,1
484,train_subject5807,Scientists make breakthrough in understanding...,0


### Pre-processing

In [262]:
def process_text(document):

        # Remove extra white space from text
        document = re.sub(r'\s+', ' ', document, flags=re.I)
         
        # Remove all the special characters from text
        document = re.sub(r'\W', ' ', str(document))
 
        # Remove all single characters from text
        #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
 
        # Converting to lowercase
        document = document.lower()

        return document

In [None]:
#pre-processing for tfidf
def clean_text(text):
    # lower text
    text = text.lower()
     #text = nltk.word_tokenize(text)
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all text
    text = ' '.join(text)
    return(text)

### Feature extraction

In [264]:
#tf-idf
def feature_extract(text, type):
    if type == 'tfidf':
        tfidfconverter = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)
        X = tfidfconverter.fit_transform(text).toarray()
    elif type == 'doc2vec':
        pass
    return X 

#### Doc2Vec

In [289]:
#prepare data
#label data for positive and negative
positives['Label'] = 1
negatives['Label'] = 0
#concatenate the data positive and negatives
train = pd.concat([positives, negatives], ignore_index=True)

In [290]:
def read_corpus(df, tokens_only=False):
    for i, line in enumerate(df['text']):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(train))
#test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [291]:
model = gensim.models.doc2vec.Doc2Vec(documents= train_corpus, dm = 1, vector_size=100, min_count=1, epochs=10, window=10, negative= 20,  alpha=0.025,min_alpha=1e-4)

In [293]:
model2= gensim.models.doc2vec.Doc2Vec(documents= train_corpus, dm = 0, vector_size=100, min_count=1, epochs=10, window=10, sample=1e-4,hs =1,  alpha=0.025,min_alpha=1e-4)

In [295]:
#map train['Vector'] to train_corpus
train['Tag'] = train_corpus
#get tags of train_corpus
tags = [x.tags[0] for x in train_corpus]
train['Vector']= [np.concatenate((model.dv[x], model2.dv[x]), axis=None) for x in tags]

In [296]:
#average of vectors for each user, including the label of user
a = train.groupby('TrainSubjectId').agg({'Vector': 'mean', 'Label': 'first'}).reset_index()

In [315]:
#joblib a model
joblib.dump(lg2, os.path.join(HOME_DIR,'lg2.pkl'))

['/home_remote/lg2.pkl']

### Models

#### Logistic Regression

In [316]:
def logistic_regression(X, y):

    w = [1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6,2**7, 2**8]
    weight = [{0: 1/(1+x),  1: x/(1+x)} for x in w]
    C = [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6]
    # define grid search
    hyperparam_grid = {"class_weight": weight
                    ,"penalty": ["l1", "l2"]
                    ,"C": C
                    ,"fit_intercept": [True, False]  }
    # define evaluation procedure
    cv = KFold(n_splits=10, shuffle=True, random_state=13)
    # define grid search
    model_test = LogisticRegression(solver='liblinear')
    grid = GridSearchCV(estimator=model_test, param_grid=hyperparam_grid, cv=cv, scoring='roc_auc')
    grid_result = grid.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    #build a model with the best parameters, fix class_weight = (0.2, 0.8)

    model = LogisticRegression(**grid_result.best_params_)
    model.fit(X, y)
    return model

#### LSTM

### Test Models

In [321]:
#TF-IDF
text = data['text'].apply(clean_text)
X_tfidf = feature_extract(text, 'tfidf')
y_tfidf = data['Label']

lg1 = LogisticRegression(C=4, class_weight={0: 0.2, 1: 0.8}, fit_intercept=True, penalty='l1', solver='liblinear')
y_pred = cross_val_predict(lg1, X_tfidf, y_tfidf, cv=cv)
#dataframe of y_pred and y
lg1_train = pd.DataFrame({'Actual': y_tfidf, 'Predicted': y_pred})

result = cross_val_score(lg1, X_tfidf, y_tfidf, cv=cv, scoring='roc_auc')
print("AUC: %.3f (%.3f)" % (result.mean(), result.std()))
print("Accuracy:",accuracy_score(y_tfidf, y_pred))
print("Precision:",precision_score(y_tfidf, y_pred))
print("Recall:",recall_score(y_tfidf, y_pred))
print("F1:",f1_score(y_tfidf, y_pred))

AUC: 0.862 (0.069)
Accuracy: 0.8868312757201646
Precision: 0.6944444444444444
Recall: 0.6024096385542169
F1: 0.6451612903225806


In [319]:
#Doc2Vec
X_doc2vec = a['Vector'].tolist()
y_doc2vec = a['Label'].tolist()

lg2 = logistic_regression(X_doc2vec, y_doc2vec)
y_pred_doc2vc = cross_val_predict(lg2, X_doc2vec, y_doc2vec, cv=cv)
#dataframe of y_pred and y
lg2_train = pd.DataFrame({'Actual': y_doc2vec, 'Predicted': y_pred_doc2vc})

result_doc2vec = cross_val_score(lg2, X_doc2vec, y_doc2vec, cv=cv, scoring='roc_auc')
print("AUC: %.3f (%.3f)" % (result_doc2vec.mean(), result_doc2vec.std()))
print("Accuracy:",accuracy_score(y_doc2vec, y_pred_doc2vc))
print("Precision:",precision_score(y_doc2vec, y_pred_doc2vc))
print("Recall:",recall_score(y_doc2vec, y_pred_doc2vc))
print("F1:",f1_score(y_doc2vec, y_pred_doc2vc))




Best: 0.996927 using {'C': 32, 'class_weight': {0: 0.3333333333333333, 1: 0.6666666666666666}, 'fit_intercept': True, 'penalty': 'l2'}
AUC: 0.997 (0.004)
Accuracy: 0.9732510288065843
Precision: 0.9605263157894737
Recall: 0.8795180722891566
F1: 0.9182389937106917
