### Import

In [0]:
import pandas as pd
import numpy as np

In [0]:
import pickle
import sys
import nltk
from nltk.stem.porter import *

from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
data = pd.read_csv('/content/drive/My Drive/Shaastra Workshop Material/Text Analysis/spam.csv',encoding = "ISO-8859-1")

In [0]:
data.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis=1, inplace=True)

In [0]:
data.rename(columns={'v1':'target','v2':'text'},inplace=True)

In [0]:
data.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Basic Preprocessing

In [0]:
train = data.copy()

train['text'] = train['text'].astype(str)

In [0]:
### Fill your code here

## 1. Lowercasing
train['text'] = train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

## 2. Punctuation Removal
train['text'] = train['text'].str.replace('[^\w\s]','')

## 3. Stopwords Removal
nltk.download('stopwords')
## for removing the stopwords
from nltk.corpus import stopwords

stop = stopwords.words('english')
train['text'] = train['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['text'].head()
""" 

fill your code for all these operations

"""

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


' \n\nfill your code for all these operations\n\n'

In [0]:
## lemmatization
## write your code here

from textblob import Word
nltk.download('wordnet')
train['text'] = train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
train['text'].head()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4             nah dont think go usf life around though
Name: text, dtype: object

### Target creation

In [0]:
train['target'].unique()#.isna().sum()

array(['ham', 'spam'], dtype=object)

In [0]:
train['target'].replace(['ham','spam'],[0,1],inplace=True)
train['target'].value_counts()

0    4825
1     747
Name: target, dtype: int64

In [0]:
train

Unnamed: 0,target,text
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor u c already say
4,0,nah dont think go usf life around though
...,...,...
5567,1,2nd time tried 2 contact u u å750 pound prize ...
5568,0,ì_ b going esplanade fr home
5569,0,pity mood soany suggestion
5570,0,guy bitching acted like id interested buying s...


### Basic Model on whole dataset

In [0]:
from sklearn.model_selection import train_test_split
x_train,x_valid,y_t,y_v = train_test_split(train['text'],train['target'],test_size=0.2,random_state=4353)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))

tfidf.fit(train['text'])

x_t = tfidf.transform(x_train)
x_v = tfidf.transform(x_valid)


In [0]:
x_v.toarray().shape

(1115, 100)

In [0]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    
    clf.fit(x_t,y_t)
    
    print('training accuracy', clf.score(x_t,y_t))
    
    if task=='binary:logistic':
      print('validation accuracy', clf.score(x_v,y_v))
      print('validation f1_score',f1_score(clf.predict(x_v),y_v))
      print('validation roc_auc score',roc_auc_score(y_v,clf.predict_proba(x_v)[::,-1]))
      print('confusion matrix \n',confusion_matrix(y_v, clf.predict(x_v)))
    
    if task=='reg:linear':
        if x_v!=None:
            print('validation r2_score', clf.score(x_v,y_v))
            print('validation MSE',mean_squared_error(clf.predict(x_v),y_v))

            
    return clf

In [0]:
train['target'].mean()

0.13406317300789664

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

lgr =  LogisticRegression(n_jobs=1)
# model_training(lgr,x_t,y_t,x_v,y_v)

xgb = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
# xgb = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.6760372565622355)

model_training(lgr,x_t,y_t,x_v,y_v)


training accuracy 0.9571460623737941
validation accuracy 0.9560538116591928
validation f1_score 0.7966804979253114
validation roc_auc score 0.9571164021164021
confusion matrix 
 [[970  10]
 [ 39  96]]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# training accuracy 0.974646623289208
# validation accuracy 0.967713004484305
# validation f1_score 0.859375
# validation roc_auc score 0.9588548752834467
# confusion matrix 
#  [[969  11]
#  [ 25 110]]
# XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#               colsample_bynode=1, colsample_bytree=1, gamma=0,
#               learning_rate=0.1, max_delta_step=0, max_depth=5,
#               min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
#               nthread=None, objective='binary:logistic', random_state=0,
#               reg_alpha=0, reg_lambda=1, scale_pos_weight=1.4266790777602751,
#               seed=None, silent=None, subsample=1, verbosity=1)

### Training on pretrained word2vec model

In [0]:
import gensim
import logging
from gensim.models import Word2Vec

wv = gensim.models.KeyedVectors.load_word2vec_format("/content/drive/My Drive/Shaastra Workshop Material/Text Analysis/GoogleNews-vectors-negative300.bin.gz", binary=True)
# above is only pretrained embeddings
# wv = gensim.models.KeyedVectors.load_word2vec_format("tmp.txt")

wv.init_sims(replace=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, post) for post in text_list ])

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens
    
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 234)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
wv.similarity(w1='queen',w2='royal')

  if np.issubdtype(vec.dtype, np.int):


0.56371856

In [0]:
wv.similarity(w1='lion',w2='cub')

  if np.issubdtype(vec.dtype, np.int):


0.40660018

In [0]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, classification_report, roc_auc_score, confusion_matrix, accuracy_score

In [0]:
def model_training(clf, x_t, y_t, x_v=None , y_v=None ,task='binary:logistic'):
    clf.fit(x_t,y_t)
    print('training accuracy', clf.score(x_t,y_t))
    
    if task=='binary:logistic':
      print('validation accuracy', clf.score(x_v,y_v))
      print('validation f1_score',f1_score(clf.predict(x_v),y_v))
      print('validation roc_auc score',roc_auc_score(y_v,clf.predict_proba(x_v)[::,-1]))
      print('confusion matrix \n',confusion_matrix(y_v, clf.predict(x_v)))
    
    if task=='reg:linear':
      print('validation r2_score', clf.score(x_v,y_v))
      print('validation MSE',mean_squared_error(clf.predict(x_v),y_v))

            
    return clf

In [0]:
%%time
xgb_w2v = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
lgr_w2v = LogisticRegression(n_jobs=1)

lgbm_w2v = LGBMClassifier(n_estimators=500)

gbdt_w2v = GradientBoostingClassifier(n_estimators=500)

model_training(xgb_w2v,X_train_word_average,train_w2v['target'],X_test_word_average,test_w2v['target'])

# model_training(lgr_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(lgbm_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(gbdt_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])


training accuracy 0.9993269015032533
validation accuracy 0.9721973094170404
validation f1_score 0.9034267912772586
validation roc_auc score 0.9867553771434235
confusion matrix 
 [[939   5]
 [ 26 145]]
CPU times: user 40.9 s, sys: 36.6 ms, total: 41 s
Wall time: 41 s


In [0]:
tuned_pred = (xgb_w2v.predict_proba(X_test_word_average)[::,-1]>0.3).astype(int)
confusion_matrix(test_w2v['target'],tuned_pred)

array([[937,   7],
       [ 24, 147]])

In [0]:
f1_score(test_w2v['target'],tuned_pred)

0.9046153846153846

### Word2Vec DOMAIN Training

In [0]:
# imports needed and set up logging
import gzip
import gensim 
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
def read_input(data):
    """This method reads the input file which is in gzip format"""
    i=0
    for line in data['text']: 
      i+=1
      if (i%10000==0):
        logging.info ("read {0} tweets".format(i))
      # do some pre-processing and return a list of words for each tweet, basically doing tokenizing
      yield gensim.utils.simple_preprocess (line)

# read the tokenized reviews into a list
# each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(train))
logging.info ("Done reading data file")

In [0]:
documents = []
i=0
for line in train['text']: 
    i+=1
    if (i%1000==0):
      logging.info ("read {0} messages".format(i))
      # do some pre-processing and return a list of words for each tweet, basically doing tokenizing
    # documents.append(gensim.utils.simple_preprocess(line))
    documents.append(nltk.word_tokenize(line))

Training Time

In [0]:
%%time

model = gensim.models.Word2Vec(documents, size=50, min_count=2, workers=5)
model.train(documents,total_examples=len(documents),epochs=10)





CPU times: user 2.55 s, sys: 29.9 ms, total: 2.58 s
Wall time: 1.76 s


In [0]:
w1 = "discount"
model.wv.most_similar(positive=w1)

  if np.issubdtype(vec.dtype, np.int):


[('125gift', 0.9947423934936523),
 ('voucher', 0.9938306212425232),
 ('08000407165', 0.9904675483703613),
 ('operator', 0.9898449182510376),
 ('10p', 0.9889860153198242),
 ('40gb', 0.9866666793823242),
 ('ipod', 0.9851861000061035),
 ('entry', 0.9846733808517456),
 ('mobile', 0.984646737575531),
 ('rental', 0.9843345880508423)]

In [0]:
model.wv.save_word2vec_format('tmp.txt', binary=False)
## save the model

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
wv = gensim.models.KeyedVectors.load_word2vec_format('tmp.txt')
wv.init_sims(replace=True)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 234)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
%%time
xgb_w2v = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
lgr_w2v = LogisticRegression(n_jobs=1)

lgbm_w2v = LGBMClassifier(n_estimators=500)

gbdt_w2v = GradientBoostingClassifier(n_estimators=500)

model_training(xgb_w2v,X_train_word_average,train_w2v['target'],X_test_word_average,test_w2v['target'])

# model_training(lgr_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(lgbm_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(gbdt_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])


training accuracy 0.9997756338344178
validation accuracy 0.9704035874439462
validation f1_score 0.8984615384615385
validation roc_auc score 0.9761373773416593
confusion matrix 
 [[936   8]
 [ 25 146]]
CPU times: user 8.14 s, sys: 24 ms, total: 8.16 s
Wall time: 8.18 s


### Transfer Learning

In [0]:
from gensim.models import Word2Vec
sentences = documents
# size option needs to be set to 300 to be the same as Google's pre-trained model
 
word2vec_model = Word2Vec(size = 300, window=5,
min_count = 1, workers = 10)

word2vec_model.build_vocab(sentences)
 
# assign the vectors to the vocabs that are in Google's pre-trained model and your sentences defined above.


In [0]:
word2vec_model.intersect_word2vec_format('/content/drive/My Drive/Shaastra Workshop Material/Text Analysis/GoogleNews-vectors-negative300.bin.gz', lockf=1.0, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# continue training with you own data
word2vec_model.train(sentences, total_examples=len(sentences), epochs = 5)



(242893, 262305)

In [0]:
w1 = ["sale"]
word2vec_model.wv.most_similar (positive=w1)

  if np.issubdtype(vec.dtype, np.int):


[('sell', 0.6767012476921082),
 ('purchase', 0.6712290048599243),
 ('selling', 0.6567376852035522),
 ('auction', 0.6562143564224243),
 ('sold', 0.6543912887573242),
 ('buying', 0.5567315816879272),
 ('price', 0.5454237461090088),
 ('buy', 0.5431312918663025),
 ('buyer', 0.5379477739334106),
 ('bought', 0.5367680191993713)]

In [0]:
word2vec_model.wv.save_word2vec_format('model_transfer_learning.txt', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
#### Training on this

In [0]:

wv = gensim.models.KeyedVectors.load_word2vec_format('model_transfer_learning.txt')
wv.init_sims(replace=True)

train_w2v, test_w2v = train_test_split(train, test_size=0.2, random_state = 234)
# x_t,x_v,y_t,y_v = train_test_split(train['tweet'],train['class'],test_size=0.2,random_state=234)

test_tokenized = test_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train_w2v.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

X_train_word_average = word_averaging_list(wv,train_tokenized)
X_test_word_average = word_averaging_list(wv,test_tokenized)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  
  if np.issubdtype(vec.dtype, np.int):


In [0]:
%%time
xgb_w2v = XGBClassifier(n_estimators=500, max_depth=5,learning_rate=0.1,scale_pos_weight=1.4266790777602751)
lgr_w2v = LogisticRegression(n_jobs=1)

lgbm_w2v = LGBMClassifier(n_estimators=500)

gbdt_w2v = GradientBoostingClassifier(n_estimators=500)

model_training(xgb_w2v,X_train_word_average,train_w2v['target'],X_test_word_average,test_w2v['target'])

# model_training(lgr_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(lgbm_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])

# model_training(gbdt_w2v,X_train_word_average,train_w2v['class'],X_test_word_average,test_w2v['class'])


training accuracy 1.0
validation accuracy 0.9766816143497757
validation f1_score 0.9202453987730062
validation roc_auc score 0.9848597482406583
confusion matrix 
 [[939   5]
 [ 21 150]]
CPU times: user 31.5 s, sys: 36.2 ms, total: 31.5 s
Wall time: 31.6 s


In [0]:
### indeed an increase in F1-Score