# Gensim/Tfidf
Gensim: 200-dim vectors, 15 epochs

## Usage
* [Doc2Vec tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb)

In [6]:
%load_ext autoreload
%autoreload 2

In [5]:
from util import const, reader
import gensim
import sklearn.feature_extraction as sk_fe
import nltk

In [6]:
fm_baseline = const.FileManager(prefix='baseline-')

In [7]:
model_loc = str(fm_baseline.model('gensim_200.model'))
load_model = True 


def read_corpus(fname):
  with open(fname) as f:
    for i, line in enumerate(f):
      yield gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(line), [i])

      
def construct_doc2vec_model(data_loc, model_loc):
  corpus = read_corpus(data_loc)
  model = gensim.models.Doc2Vec(
    corpus, size=200, workers=4, seed=const.SEED, iter=15)
  model.save(model_loc)
  return model


#d2v_model = gensim.models.Doc2Vec.load(model_loc) if load_model\
#  else construct_doc2vec_model(const.Path.TWEETS_RAW_TEXT, model_loc)

In [8]:
%%capture
tfidf_vectorizer = sk_fe.text.TfidfVectorizer(
  stop_words=sk_fe.stop_words.ENGLISH_STOP_WORDS)
# content_tfidf = tfidf_vectorizer.fit_transform(df.input)
#  reader.stream_lines(const.FileManager.TWEETS_RAW_TEXT))

In [9]:
%time df = reader.load_df_multi(clean=True)

CPU times: user 30.9 s, sys: 577 ms, total: 31.5 s
Wall time: 31.7 s


In [11]:
%%capture

import sklearn as sk
import sklearn.model_selection
import numpy as np
import logging
import sklearn as s
import sklearn.pipeline
import logging
from collections import namedtuple
from util import logutil, evaluate
logutil.ignore_warnings()

kfolder = sk.model_selection.KFold(
  n_splits=5, shuffle=True, random_state=const.SEED)

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

X = np.array(df.body_content)
y = np.array(df.label)

lr_tfidf = sk.linear_model.LogisticRegression()
tfidf_vectorizer = sk_fe.text.TfidfVectorizer(
  stop_words=sk_fe.stop_words.ENGLISH_STOP_WORDS)

tfidf_pipeline = sk.pipeline.make_pipeline(tfidf_vectorizer, lr_tfidf)

In [15]:
from sklearn.metrics import f1_score

def pipeline_cross_eval(pipeline, scorings):
  for scoring in scorings:
    eval_ = sk.model_selection.cross_val_score(
      pipeline, X, y, cv=kfolder.split(X), scoring=scoring)
    log.debug(eval_)
    yield (scoring, eval_)

list(pipeline_cross_eval(tfidf_pipeline, ['f1_micro', 'f1_macro']))

DEBUG:__main__:[0.96727623 0.96289167 0.96599294 0.96781093 0.96299465]
DEBUG:__main__:[0.81837641 0.82741343 0.84737561 0.85169917 0.82574659]


[('f1_micro',
  array([0.96727623, 0.96289167, 0.96599294, 0.96781093, 0.96299465])),
 ('f1_macro',
  array([0.81837641, 0.82741343, 0.84737561, 0.85169917, 0.82574659]))]

In [16]:
print('micro', np.average([0.96727623, 0.96289167, 0.96599294, 0.96781093, 0.96299465]))
print('macro', np.average([0.81837641, 0.82741343, 0.84737561, 0.85169917, 0.82574659]))

micro 0.9653932839999999
macro 0.834122242


In [17]:
class Doc2Vec(sklearn.pipeline.TransformerMixin):
  def __init__(self):
    self.model = None
  
  def corpus(self, X):
    for i, line in enumerate(X):
      yield gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(line), [i])
      
  def fit(self, X, y):
    self.model = gensim.models.Doc2Vec(
      list(self.corpus(X)), size=200, workers=4, seed=const.SEED, iter=15)
    return self
  
  def transform(self, X):
    return np.array([self.model.infer_vector(v) for v in X])

lr_doc2vec = sk.linear_model.LogisticRegression()
doc2vec = Doc2Vec()
doc2vec_pipeline = sk.pipeline.make_pipeline(doc2vec, lr_doc2vec)

In [18]:
gensim.logger.setLevel(logging.ERROR)

list(pipeline_cross_eval(doc2vec_pipeline, ['f1_micro', 'f1_macro']))

DEBUG:__main__:[0.86439953 0.85477489 0.86226072 0.85819698 0.85754011]
DEBUG:__main__:[0.23181714 0.23042551 0.23150913 0.23092196 0.23082681]


[('f1_micro',
  array([0.86439953, 0.85477489, 0.86226072, 0.85819698, 0.85754011])),
 ('f1_macro',
  array([0.23181714, 0.23042551, 0.23150913, 0.23092196, 0.23082681]))]

In [20]:
print('micro', np.average([0.86439953, 0.85477489, 0.86226072, 0.85819698, 0.85754011]))
print('macro', np.average([0.23181714, 0.23042551, 0.23150913, 0.23092196, 0.23082681]))

micro 0.8594344459999999
macro 0.23110011
