In [1]:
import pandas as pd
import nltk
import os
import re
import numpy as np
from nltk import word_tokenize, ngrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import string
import gensim
import joblib
from gensim.test.utils import get_tmpfile

HOME_DIR = "/home_remote"

### Data preparation

In [6]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)

In [7]:
#join tiltle and text
positives['text'] = positives['Title'] + positives['Text']
negatives['text'] = negatives['Title'] + negatives['Text']
#join all text of the same user
pos = positives.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
neg = negatives.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
#Labelling the data
pos['Label'] = 1
neg['Label'] = 0

In [8]:
#concatenate the data
data = pd.concat([pos, neg], ignore_index=True)
data = data.sample(frac=1).reset_index(drop=True)

data

Unnamed: 0,TrainSubjectId,text,Label
0,train_subject4454,Illegal aliens suspected in murder of nurse s...,0
1,train_subject8344,Well it's not as if they really need to tr...,0
2,train_subject9884,I think everybody does. Personally I can't...,0
3,train_subject9201,Or a golden tuffalo. I think the real...,0
4,train_subject8559,Little 7 year old Daisy having a nap For ...,0
...,...,...,...
481,train_subject2418,TIFU by making a 'shitty' joke at work I jus...,0
482,train_subject1879,"How did you get out?\nalso, congratulation...",1
483,train_subject6188,A U.S.-led coalition dropped new leaflets ove...,0
484,train_subject1839,"I did. I was always an incredibly morose, ...",1


### Pre-processing

In [262]:
def process_text(document):

        # Remove extra white space from text
        document = re.sub(r'\s+', ' ', document, flags=re.I)
         
        # Remove all the special characters from text
        document = re.sub(r'\W', ' ', str(document))
 
        # Remove all single characters from text
        #document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
 
        # Converting to lowercase
        document = document.lower()

        return document

In [390]:
#pre-processing for tfidf
def clean_text(text):
    # lower text
    text = text.lower()
     #text = nltk.word_tokenize(text)
    text = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove stop words
    stop = stopwords.words('english')
    text = [x for x in text if x not in stop]
    # remove empty tokens
    text = [t for t in text if len(t) > 0]
    # remove words with only one letter
    text = [t for t in text if len(t) > 1]
    # join all text
    text = ' '.join(text)
    return(text)

### Feature extraction

In [438]:
#tf-idf
def feature_extract(text, type):
    if type == 'tfidf':
        tfidfconverter = TfidfVectorizer(max_features=1000, max_df=0.7, min_df=0.1)
        X = tfidfconverter.fit_transform(text).toarray()
    elif type == 'doc2vec':
        pass
    return X, tfidfconverter

#### Doc2Vec

In [9]:
#prepare data
#label data for positive and negative
positives['Label'] = 1
negatives['Label'] = 0
#concatenate the data positive and negatives
train = pd.concat([positives, negatives], ignore_index=True)

In [10]:
def read_corpus(df, tokens_only=False):
    for i, line in enumerate(df['text']):
        tokens = gensim.utils.simple_preprocess(line)
        if tokens_only:
            yield tokens
        else:
            # For training data, add tags
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

train_corpus = list(read_corpus(train))
#test_corpus = list(read_corpus(lee_test_file, tokens_only=True))

In [11]:
model = gensim.models.doc2vec.Doc2Vec(documents= train_corpus, dm = 1, vector_size=100, min_count=1, epochs=10, window=10, negative= 20,  alpha=0.025,min_alpha=1e-4)

In [12]:
model2= gensim.models.doc2vec.Doc2Vec(documents= train_corpus, dm = 0, vector_size=100, min_count=1, epochs=10, window=10, sample=1e-4,hs =1,  alpha=0.025,min_alpha=1e-4)

In [13]:
#map train['Vector'] to train_corpus
train['Tag'] = train_corpus
#get tags of train_corpus
tags = [x.tags[0] for x in train_corpus]
train['Vector']= [np.concatenate((model.dv[x], model2.dv[x]), axis=None) for x in tags]

In [14]:
#average of vectors for each user, including the label of user
a = train.groupby('TrainSubjectId').agg({'Vector': 'mean', 'Label': 'first'}).reset_index()

In [16]:
X_doc2vec = np.array(a['Vector'].tolist())
y_doc2vec = np.array(a['Label'].tolist())

In [315]:
#joblib a model
joblib.dump(lg2, os.path.join(HOME_DIR,'lg2.pkl'))

['/home_remote/lg2.pkl']

### Models

#### Logistic Regression

In [410]:
def logistic_regression(X, y):

    w = [1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6,2**7, 2**8]
    weight = [{0: 1/(1+x),  1: x/(1+x)} for x in w]
    C = [2**-6, 2**-5, 2**-4, 2**-3, 2**-2, 2**-1, 1, 2**1, 2**2, 2**3, 2**4, 2**5, 2**6]
    # define grid search
    hyperparam_grid = {"class_weight": weight
                    ,"penalty": ["l1", "l2"]
                    ,"C": C
                    ,"fit_intercept": [True, False]  }
    # define evaluation procedure
    cv = KFold(n_splits=10, shuffle=True, random_state=13)
    # define grid search
    model_test = LogisticRegression(solver='liblinear')
    grid = GridSearchCV(estimator=model_test, param_grid=hyperparam_grid, cv=cv, scoring='roc_auc')
    grid_result = grid.fit(X, y)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    #build a model with the best parameters, fix class_weight = (0.2, 0.8)

    model = LogisticRegression(**grid_result.best_params_)
    model.fit(X, y)
    return model

#### LSTM

### Test Models

In [439]:
#TF-IDF
text = data['text'].apply(clean_text)
X_tfidf, tfidf_model = feature_extract(text, 'tfidf')
y_tfidf = data['Label']

lg1 = LogisticRegression(C=4, class_weight={0: 0.2, 1: 0.8}, fit_intercept=True, penalty='l1', solver='liblinear')
y_pred = cross_val_predict(lg1, X_tfidf, y_tfidf, cv=cv)
#dataframe of y_pred and y
lg1_train = pd.DataFrame({'Actual': y_tfidf, 'Predicted': y_pred})

result = cross_val_score(lg1, X_tfidf, y_tfidf, cv=cv, scoring='roc_auc')
print("AUC: %.3f (%.3f)" % (result.mean(), result.std()))
print("Accuracy:",accuracy_score(y_tfidf, y_pred))
print("Precision:",precision_score(y_tfidf, y_pred))
print("Recall:",recall_score(y_tfidf, y_pred))
print("F1:",f1_score(y_tfidf, y_pred))

AUC: 0.841 (0.087)
Accuracy: 0.8868312757201646
Precision: 0.6891891891891891
Recall: 0.6144578313253012
F1: 0.6496815286624203


In [319]:
#Doc2Vec
X_doc2vec = a['Vector'].tolist()
y_doc2vec = a['Label'].tolist()

lg2 = logistic_regression(X_doc2vec, y_doc2vec)
y_pred_doc2vc = cross_val_predict(lg2, X_doc2vec, y_doc2vec, cv=cv)
#dataframe of y_pred and y
lg2_train = pd.DataFrame({'Actual': y_doc2vec, 'Predicted': y_pred_doc2vc})

result_doc2vec = cross_val_score(lg2, X_doc2vec, y_doc2vec, cv=cv, scoring='roc_auc')
print("AUC: %.3f (%.3f)" % (result_doc2vec.mean(), result_doc2vec.std()))
print("Accuracy:",accuracy_score(y_doc2vec, y_pred_doc2vc))
print("Precision:",precision_score(y_doc2vec, y_pred_doc2vc))
print("Recall:",recall_score(y_doc2vec, y_pred_doc2vc))
print("F1:",f1_score(y_doc2vec, y_pred_doc2vc))




Best: 0.996927 using {'C': 32, 'class_weight': {0: 0.3333333333333333, 1: 0.6666666666666666}, 'fit_intercept': True, 'penalty': 'l2'}
AUC: 0.997 (0.004)
Accuracy: 0.9732510288065843
Precision: 0.9605263157894737
Recall: 0.8795180722891566
F1: 0.9182389937106917


In [322]:
lg1 = LogisticRegression(C=4, class_weight={0: 0.2, 1: 0.8}, fit_intercept=True, penalty='l1', solver='liblinear')

In [323]:
lg1.fit(X_tfidf, y_tfidf)

LogisticRegression(C=4, class_weight={0: 0.2, 1: 0.8}, penalty='l1',
                   solver='liblinear')

In [325]:
lg2= logistic_regression(X_doc2vec, y_doc2vec)
lg2.fit(X_doc2vec, y_doc2vec)


Best: 0.996927 using {'C': 32, 'class_weight': {0: 0.3333333333333333, 1: 0.6666666666666666}, 'fit_intercept': True, 'penalty': 'l2'}


LogisticRegression(C=32,
                   class_weight={0: 0.3333333333333333, 1: 0.6666666666666666})

In [326]:
joblib.dump(lg2, os.path.join(HOME_DIR,'lg2.pkl'))

['/home_remote/lg2.pkl']

In [329]:
#joblib a model
joblib.dump(model, os.path.join(HOME_DIR,'model_doc2vec.pkl'))
joblib.dump(model2, os.path.join(HOME_DIR,'model2_doc2vec.pkl'))

['/home_remote/model2_doc2vec.pkl']

In [333]:
#save model
fname = get_tmpfile(os.path.join(HOME_DIR,"master_thesis/model_evaluation/my_doc2vec_model"))
model.save(fname)
fname2 = get_tmpfile(os.path.join(HOME_DIR,"master_thesis/model_evaluation/my_doc2vec_model2"))
model2.save(fname2)

In [350]:
tt = train['text'][0:3].str.cat(sep=' ')

In [356]:
model.infer_vector(read_corpus(tt, tokens_only=False))

array([-2.4728922e-03,  1.1294025e-03, -2.4310261e-04, -9.6121192e-04,
        4.7670985e-03,  1.4386200e-03, -3.0160856e-03, -4.8014042e-03,
        1.2395275e-03,  2.8639727e-03,  4.8470020e-04,  1.5936660e-03,
       -3.5161019e-04, -4.6375393e-05,  3.9771260e-03,  4.8645083e-03,
       -6.5763446e-04,  8.5542200e-04,  3.2963913e-03, -1.0223204e-03,
        1.3308584e-03, -3.0275257e-04,  3.9787532e-04, -3.4252019e-03,
        4.0295329e-03,  4.9934722e-03, -2.5388356e-03,  3.1203639e-03,
        3.1022662e-03,  2.1049499e-03,  2.4390297e-03, -4.6553402e-03,
       -1.4479510e-03,  5.9771957e-04, -1.0302877e-03, -4.6756309e-03,
       -3.4628245e-03,  4.5805462e-03,  1.5165460e-03, -1.1353013e-03,
       -9.7143231e-04,  3.7325716e-03,  1.4615315e-03, -4.1971123e-03,
       -9.2977466e-04, -2.8820890e-03, -2.9301124e-03, -1.8280179e-03,
       -3.2186517e-03, -7.8540383e-04, -6.8161875e-04,  8.2067726e-04,
        3.9903154e-03,  1.3948452e-03,  4.7700340e-03,  1.1494773e-03,
      

In [357]:
doc2vec = joblib.load(os.path.join(HOME_DIR, "lg2.pkl"))

### Test saved model

In [520]:
conventional_model = LogisticRegression(solver='liblinear', C = 32, class_weight={0: 0.2, 1: 0.8}, fit_intercept=True, penalty='l2')

In [521]:
conventional_model.fit(X_doc2vec, y_doc2vec)

LogisticRegression(C=32, class_weight={0: 0.2, 1: 0.8}, solver='liblinear')

In [523]:
y_pred2 = conventional_model.predict(X_doc2vec)

In [524]:
#compare y_pred2 and y_doc2vec
con = pd.DataFrame({'Actual': y_doc2vec, 'Predicted': y_pred2})
#misclassified
con[con['Actual'] != con['Predicted']]

Unnamed: 0,Actual,Predicted
67,1,0
234,0,1
241,1,0
252,1,0
330,0,1


In [525]:
#save model conventional_model
joblib.dump(conventional_model, os.path.join(HOME_DIR,'conventional_model.pkl'))

['/home_remote/conventional_model.pkl']

In [526]:
#load model
conventional_model_loaded = joblib.load(os.path.join(HOME_DIR, "conventional_model.pkl"))

In [530]:
conventional_model_loaded.predict(X_doc2vec[67].reshape(1, -1))

array([0])

### LSTM


In [17]:
#train-test split X_doc2vec, y_doc2vec
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_doc2vec, y_doc2vec, test_size=0.2, random_state=13)

In [29]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Define the LSTM model
model = Sequential()
model.add(input_shape=(10, 1))  # Embedding layer for word embeddings
model.add(LSTM(128))  # LSTM layer with 128 units
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification with sigmoid activation

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Predict probabilities on new data
X_new_data = np.random.rand(10, 10, 100)  # Replace with your new data
predicted_probabilities = model.predict(X_new_data)
print(predicted_probabilities)


2023-10-26 18:24:52.625036: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-26 18:24:53.220730: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10794 MB memory:  -> device: 0, name: Tesla K80, pci bus id: 0000:8d:00.0, compute capability: 3.7


TypeError: add() got an unexpected keyword argument 'input_shape'

In [60]:
model2.infer_vector(train_corpus[0])

TypeError: sequence item 0: expected str instance, list found

In [78]:
lg2 = joblib.load(os.path.join(HOME_DIR, "lg2.pkl"))

In [105]:
lg2.predict_proba(X_doc2vec[8].reshape(1, -1))[0,1]

0.9857112907136578

In [85]:
y_doc2vec

array([0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,