In [56]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
stop_words = stopwords.words('english')
import xgboost as xgb
import pickle
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.exceptions import ConvergenceWarning
from warnings import simplefilter
simplefilter("ignore", category=ConvergenceWarning)


In [15]:
# Load the pickled Amazon article frame and keep only the article text and its
# trend label, renamed to the generic text/target column names used below.
# NOTE(review): unpickling can execute arbitrary code; only load this file from
# a trusted source.
amzn_train = pd.read_pickle("amazon_data_for_training.pkl", compression='infer')
train = amzn_train[['article_content', 'trend']].rename(
    columns={'article_content': 'text', 'trend': 'target'}
)
train.head()

Unnamed: 0,text,target
12886,here are some things going on today in your wo...,1
13571,shutterstock photo\nstocks indexes opened the ...,1
11837,by ryan vlastelica\nto simply match the market...,1
13804,what happened shares of many optical networkin...,1
15737,by nigam arora\nthe practical way to take adva...,1


In [16]:
# Display the (rows, columns) shape of the text/target frame.
train.shape

(5677, 2)

In [17]:
# Hold out 10% of the rows as a validation set; the split is shuffled,
# stratified on the target, and seeded so it is repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text'].to_numpy(),
    train['target'].to_numpy(),
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=train['target'].to_numpy(),
)

In [18]:
# tf-idf vectorization
# Word-level 1- to 3-grams with English stop words removed; terms that occur in
# fewer than 3 documents are dropped.
# The idf/tf switches are real booleans: recent scikit-learn releases validate
# parameter types and reject the integer 1 for boolean options.
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=True, smooth_idf=True, sublinear_tf=True,
            stop_words = 'english')

# NOTE(review): the vocabulary and IDF weights are learned from the training AND
# validation texts, so held-out documents influence the features — confirm this
# is intended before trusting validation / cross-validation scores.
tfv.fit(list(x_train) + list(x_valid))
xtrain_tfv = tfv.transform(x_train)
xvalid_tfv = tfv.transform(x_valid)

In [19]:
# Raw n-gram counts: word-level 1- to 3-grams with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is built from the training and validation texts together.
ctv.fit([*x_train, *x_valid])
xtrain_ctv, xvalid_ctv = ctv.transform(x_train), ctv.transform(x_valid)

In [54]:
# Logistic regression on the tf-idf features, evaluated with 5-fold CV.
# `scoring` is defined here and reused by the evaluation cells that follow.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
}
clf = LogisticRegression(C=1.0)
scores = cross_validate(clf, xtrain_tfv, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.7228426142974061 
mean k-fold f1: 0.6311187333385699


In [57]:
# Logistic regression on the n-gram count features, evaluated with 5-fold CV.
clf = LogisticRegression(C=1.0)
scores = cross_validate(clf, xtrain_ctv, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.7204921693363054 
mean k-fold f1: 0.6869121162715618


In [61]:
# Multinomial Naive Bayes on the tf-idf features, evaluated with 5-fold CV.
clf = MultinomialNB()
scores = cross_validate(clf, xtrain_tfv, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.6834993511982228 
mean k-fold f1: 0.5024658467032534


In [60]:
# Multinomial Naive Bayes on the n-gram count features, evaluated with 5-fold CV.
clf = MultinomialNB()
scores = cross_validate(clf, xtrain_ctv, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.7238241545930757 
mean k-fold f1: 0.6860893079292215


# GloVe

In [24]:
# load the GloVe vectors in a dictionary:
# each line is split on whitespace; the first field is the token and the
# remaining fields are parsed as a float32 coefficient vector.

embeddings_index = {}
# `with` guarantees the file is closed even if parsing is interrupted, and the
# encoding is stated explicitly instead of relying on the platform default.
with open('glove.840B.300d/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # blank line: nothing to parse
            continue
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except ValueError:
            # The remaining fields are not all numeric (e.g. the token itself
            # contained whitespace); skip the line, as before, but without
            # hiding unrelated errors.
            continue
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196017it [01:40, 21770.85it/s]

Found 2195884 word vectors.





In [25]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s, embeddings=None, tokenizer=None, stopword_list=None, dim=300):
    """Return the L2-normalised sum of the word vectors found in ``s``.

    The text is lower-cased and tokenised; stop words and tokens that are not
    purely alphabetic are discarded, and tokens without an embedding are
    ignored. The remaining vectors are summed and scaled to unit length.

    Parameters
    ----------
    s : object
        Text to embed; it is converted with ``str(s)`` first.
    embeddings : mapping, optional
        Token -> vector lookup. Defaults to the notebook-level
        ``embeddings_index``.
    tokenizer : callable, optional
        Function turning a string into a list of tokens. Defaults to
        ``word_tokenize``.
    stopword_list : iterable of str, optional
        Tokens to drop. Defaults to the notebook-level ``stop_words``.
    dim : int, optional
        Length of the zero vector returned when no token has an embedding.

    Returns
    -------
    numpy.ndarray
        Unit-length sentence vector, or an all-zero vector when no token could
        be embedded or the summed vector has zero norm.
    """
    # Resolve defaults at call time so the notebook globals are only required
    # when the caller does not supply replacements.
    if embeddings is None:
        embeddings = embeddings_index
    if tokenizer is None:
        tokenizer = word_tokenize
    if stopword_list is None:
        stopword_list = stop_words
    # set membership is constant-time; the stop-word list is scanned linearly
    excluded = set(stopword_list)

    tokens = tokenizer(str(s).lower())
    vectors = [
        embeddings[w]
        for w in tokens
        if w not in excluded and w.isalpha() and w in embeddings
    ]
    if not vectors:
        # nothing to sum: return an explicit zero vector
        return np.zeros(dim)

    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # dividing by a zero norm would produce NaNs
        return np.zeros_like(v)
    return v / norm

In [26]:
# Embed every training and validation document with sent2vec; each result is a
# plain list holding one vector per document.
xtrain_glove = list(map(sent2vec, tqdm(x_train)))
xvalid_glove = list(map(sent2vec, tqdm(x_valid)))

100%|██████████| 5109/5109 [00:42<00:00, 121.34it/s]
100%|██████████| 568/568 [00:04<00:00, 127.68it/s]


In [62]:
# Logistic regression on the GloVe sentence vectors, evaluated with 5-fold CV.
clf = LogisticRegression(C=1.0)
scores = cross_validate(clf, xtrain_glove, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.5562721019069213 
mean k-fold f1: 0.24653244807902813


In [28]:
# Fitting a simple Naive Bayes on GloVe (disabled: MultinomialNB fails because the GloVe sentence vectors contain negative values)
# clf = MultinomialNB()
# clf.fit(xtrain_glove, y_train)
# predictions = clf.predict(xvalid_glove)
# accuracy_score(predictions, y_valid)

In [64]:
# perform pca on glove data
# Reduce the GloVe sentence vectors to 10 principal components. The projection
# is fitted on the training vectors only and then applied to both splits.
# random_state is pinned because, depending on the solver scikit-learn selects,
# PCA may use a randomized SVD whose output otherwise varies between runs.
pca = PCA(n_components = 10, random_state = 42)
pca.fit(xtrain_glove)
xtrain_glove_pca = pca.transform(xtrain_glove)
xvalid_glove_pca = pca.transform(xvalid_glove)

In [67]:
# Logistic regression on the PCA-reduced GloVe vectors, evaluated with 5-fold CV.
clf = LogisticRegression(C=1.0)
scores = cross_validate(clf, xtrain_glove_pca, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.5515750453777905 
mean k-fold f1: 0.14086919708197243


In [66]:
# Fitting a simple SVM to GloVe with PCA
# probability=True is not set: the accuracy and F1 scorers below are built with
# make_scorer's defaults and score class predictions, so probability estimates
# are never requested. Enabling them makes every fit run an additional internal
# cross-validation for calibration.
clf = SVC(C=1.0)
scores = cross_validate(clf, xtrain_glove_pca, y_train, cv=5, scoring = scoring)
acc = scores.get('test_accuracy')
f1 = scores.get('test_f1')
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

mean k-fold accuracy: 0.574671238626802 
mean k-fold f1: 0.29009922092286844


In [68]:
# XGBoost on the PCA-reduced GloVe vectors, evaluated with 5-fold CV (taking too long).
# NOTE(review): this cell was previously labelled "tf-idf", but the features
# passed in are xtrain_glove_pca — confirm which feature set was intended.
clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
scores = cross_validate(clf, xtrain_glove_pca, y_train, scoring=scoring, cv=5)
acc, f1 = scores['test_accuracy'], scores['test_f1']
print(f'mean k-fold accuracy: {np.mean(acc)} \nmean k-fold f1: {np.mean(f1)}')

In [33]:
# Pair each GloVe sentence vector with the training label of the same row.
glove_train_data = pd.DataFrame(dict(features=xtrain_glove, trend=y_train))

In [41]:
# Sanity check: length of the first stored sentence vector.
len(glove_train_data['features'].iloc[0])

300

In [42]:
# Persist the GloVe sentence vectors of the training split with pickle.
# NOTE(review): this writes the raw `xtrain_glove` list, not the
# `glove_train_data` frame, so the labels are not saved — confirm that is intended.
with open('pkls/amzn_train_glove.pkl', 'wb') as f:
    pickle.dump(xtrain_glove, f)