# Gensim/Tfidf
Gensim: 200-dim vectors, 15 epochs

## Usage
* [Doc2Vec tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb)

In [6]:
%load_ext autoreload
%autoreload 2

In [20]:
from util import const, reader
import gensim
import sklearn.feature_extraction as sk_fe
import nltk

In [8]:
fm_baseline = const.FileManager(prefix='baseline-')

In [9]:
model_loc = str(fm_baseline.model('gensim_200.model'))
load_model = True 


def read_corpus(fname):
  with open(fname) as f:
    for i, line in enumerate(f):
      yield gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(line), [i])

      
def construct_doc2vec_model(data_loc, model_loc):
  corpus = read_corpus(data_loc)
  model = gensim.models.Doc2Vec(
    corpus, size=200, workers=4, seed=const.SEED, iter=15)
  model.save(model_loc)
  return model


#d2v_model = gensim.models.Doc2Vec.load(model_loc) if load_model\
#  else construct_doc2vec_model(const.Path.TWEETS_RAW_TEXT, model_loc)

In [222]:
%%capture
tfidf_vectorizer = sk_fe.text.TfidfVectorizer(
  stop_words=sk_fe.stop_words.ENGLISH_STOP_WORDS)
# content_tfidf = tfidf_vectorizer.fit_transform(df.input)
#  reader.stream_lines(const.FileManager.TWEETS_RAW_TEXT))

In [10]:
%time df = reader.load_df()

CPU times: user 54.2 s, sys: 1.91 s, total: 56.1 s
Wall time: 57.1 s


In [11]:
from pprint import pprint
pprint(list(df.keys()))

['index',
 'input',
 'output',
 'children',
 'created_at',
 'favorite_count',
 'id',
 'label',
 'media',
 'parents',
 'reply_count',
 'retweet_count',
 'user_handle',
 'user_id',
 'user_is_verified',
 'user_name',
 'body_atreplies',
 'body_content',
 'body_hashtags',
 'body_links']


In [16]:
%%capture

import sklearn as sk
import sklearn.model_selection
import numpy as np
import logging
import sklearn as s
import sklearn.pipeline
import logging
from collections import namedtuple
from util import logutil, evaluate
logutil.ignore_warnings()

kfolder = sk.model_selection.KFold(
  n_splits=5, shuffle=True, random_state=const.SEED)

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

X = np.array(df.input)
y = np.array(df.label)

lr_tfidf = sk.linear_model.LogisticRegression()
tfidf_vectorizer = sk_fe.text.TfidfVectorizer(
  stop_words=sk_fe.stop_words.ENGLISH_STOP_WORDS)

tfidf_pipeline = sk.pipeline.make_pipeline(tfidf_vectorizer, lr_tfidf)

In [21]:
def pipeline_cross_eval(pipeline, scorings):
  for scoring in scorings:
    eval_ = sk.model_selection.cross_val_score(
      pipeline, X, y, cv=kfolder.split(X), scoring=scoring)
    log.debug(eval_)
    yield (scoring, eval_)

list(pipeline_cross_eval(tfidf_pipeline, const.binary_metrics))

DEBUG:__main__:[0.82284228 0.81663329 0.82408242 0.82354895 0.82273016]
DEBUG:__main__:[0.74690431 0.73783742 0.74452101 0.74544041 0.74621762]
DEBUG:__main__:[0.8481869  0.83602701 0.84671611 0.84578432 0.84645574]


[('roc_auc',
  array([0.82284228, 0.81663329, 0.82408242, 0.82354895, 0.82273016])),
 ('accuracy',
  array([0.74690431, 0.73783742, 0.74452101, 0.74544041, 0.74621762])),
 ('average_precision',
  array([0.8481869 , 0.83602701, 0.84671611, 0.84578432, 0.84645574]))]

In [24]:
class Doc2Vec(sklearn.pipeline.TransformerMixin):
  def __init__(self):
    self.model = None
  
  def corpus(self, X):
    for i, line in enumerate(X):
      yield gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(line), [i])
      
  def fit(self, X, y):
    self.model = gensim.models.Doc2Vec(
      list(self.corpus(X)), size=200, workers=4, seed=const.SEED, iter=15)
    return self
  
  def transform(self, X):
    return np.array([self.model.infer_vector(v) for v in X])

lr_doc2vec = sk.linear_model.LogisticRegression()
doc2vec = Doc2Vec()
doc2vec_pipeline = sk.pipeline.make_pipeline(doc2vec, lr_doc2vec)

In [25]:
gensim.logger.setLevel(logging.ERROR)

list(pipeline_cross_eval(doc2vec_pipeline, const.binary_metrics))

DEBUG:__main__:[0.49700495 0.49595984 0.49878844 0.4970073  0.49308227]
DEBUG:__main__:[0.55478991 0.54111186 0.54401326 0.54751295 0.55150259]
DEBUG:__main__:[0.55058566 0.53957753 0.54379852 0.54195819 0.54683085]


[('roc_auc',
  array([0.49700495, 0.49595984, 0.49878844, 0.4970073 , 0.49308227])),
 ('accuracy',
  array([0.55478991, 0.54111186, 0.54401326, 0.54751295, 0.55150259])),
 ('average_precision',
  array([0.55058566, 0.53957753, 0.54379852, 0.54195819, 0.54683085]))]