In [22]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
stop_words = stopwords.words('english')
import xgboost as xgb
import pickle

In [39]:
# train = pd.read_csv('train.csv')
amzn_train = pd.read_pickle("amazon_data_for_training.pkl", compression='infer')
amzn_train.head()
train = amzn_train[['article_content', 'trend']]
train.columns = ['text', 'target']
train.head()

Unnamed: 0,text,target
12886,here are some things going on today in your wo...,1
13571,shutterstock photo\nstocks indexes opened the ...,1
11837,by ryan vlastelica\nto simply match the market...,1
13804,what happened shares of many optical networkin...,1
15737,by nigam arora\nthe practical way to take adva...,1


In [30]:
train.shape

(5677, 2)

In [5]:
# split train data into train and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(train.text.values, train.target.values,
                                                        stratify = train.target.values,
                                                        random_state = 42,
                                                        test_size = 0.1,
                                                        shuffle = True)

In [6]:
# tf-idf vectorization
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

tfv.fit(list(x_train) + list(x_valid))
xtrain_tfv = tfv.transform(x_train)
xvalid_tfv = tfv.transform(x_valid)

In [8]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(x_train) + list(x_valid))
xtrain_ctv =  ctv.transform(x_train) 
xvalid_ctv = ctv.transform(x_valid)

In [7]:
# logistic regression with tf-idf
clf = LogisticRegression(C = 1.0)
clf.fit(xtrain_tfv, y_train)
predictions = clf.predict(xvalid_tfv)
accuracy_score(predictions, y_valid)

0.7165492957746479

In [11]:
# logistic regression with n-gram
clf = LogisticRegression(C = 1.0)
clf.fit(xtrain_ctv, y_train)
predictions = clf.predict(xvalid_ctv)
accuracy_score(predictions, y_valid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7095070422535211

In [14]:
# Fitting a simple Naive Bayes on tf-idf
clf = MultinomialNB()
clf.fit(xtrain_tfv, y_train)
predictions = clf.predict(xvalid_tfv)
accuracy_score(predictions, y_valid)

0.6778169014084507

In [15]:
# Fitting a simple Naive Bayes on n-gram
clf = MultinomialNB()
clf.fit(xtrain_ctv, y_train)
predictions = clf.predict(xvalid_ctv)
accuracy_score(predictions, y_valid)

0.721830985915493

# GloVe

In [16]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('glove.840B.300d/glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    except:
        pass
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [01:36, 22647.58it/s]

Found 2195884 word vectors.





In [17]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [18]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(x_train)]
xvalid_glove = [sent2vec(x) for x in tqdm(x_valid)]

100%|██████████| 5109/5109 [00:40<00:00, 127.01it/s]
100%|██████████| 568/568 [00:04<00:00, 130.89it/s]


In [19]:
# logistic regression with GloVe
clf = LogisticRegression(C = 1.0)
clf.fit(xtrain_glove, y_train)
predictions = clf.predict(xvalid_glove)
accuracy_score(predictions, y_valid)

0.5774647887323944

In [46]:
glove_train_data = pd.DataFrame({'features': xtrain_glove, 'trend': y_train})

In [45]:
with open('pkls/amzn_train_glove.pkl', 'wb') as f:
    pickle.dump(xtrain_glove, f)