# Environment

In [1]:
import os
import psutil
import subprocess
import datetime
import joblib
import copy
import pprint
import google.cloud.bigquery as bigquery
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score
)

In [2]:
import pkg_resources
pkg_resources.get_distribution("scikit-learn").version

'0.21.3'

# Function Definitions

In [3]:
def query_to_dataframe(query):

    client = bigquery.Client()
    df = client.query(query).to_dataframe()

    # label
    df["label"] = df["tags"].apply(lambda row: ",".join(row))
    del df["tags"]

    # features
    df["text"] = df["title"] + df["text_body"] + df["code_body"]
    del df["code_body"]
    del df["title"]
    del df["text_body"]

    # use BigQuery index
    df.set_index("id", inplace=True)

    return df

In [4]:
def input_fn(df):
    label = df['label']
    del df['label']
    
    features = df['text']
    return features, label

# Model Tests

In [5]:
train_query = """
SELECT
  *
FROM
  `nlp-text-classification.stackoverflow.posts_preprocessed`
WHERE
  MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))),100) >= 0.2 AND RAND() < 0.02
"""

eval_query = """
SELECT
  *
FROM
  `nlp-text-classification.stackoverflow.posts_preprocessed`
WHERE
  MOD(ABS(FARM_FINGERPRINT(CAST(id AS STRING))),100) <= 0.2 AND RAND() < 0.02
"""

In [6]:
train_df = query_to_dataframe(train_query)
eval_df = query_to_dataframe(eval_query)

In [7]:
train_X, train_y = input_fn(train_df)
eval_X, eval_y = input_fn(eval_df)

In [8]:
print('size of the training data:', len(train_df ))
print('size of the evaluation data:', len(eval_df ))
print(train_df.info(memory_usage="deep"))

size of the training data: 613004
size of the evaluation data: 6140
<class 'pandas.core.frame.DataFrame'>
Index: 613004 entries, 1719637 to 24967669
Data columns (total 1 columns):
text    613004 non-null object
dtypes: object(1)
memory usage: 401.5 MB
None


In [9]:
max_df  =0.75
min_df = 20
norm = 'l2'
alpha = 0.2

## Pipeline 1

In [12]:
pipeline=Pipeline([('Word Embedding', CountVectorizer(max_df=max_df,min_df=min_df)),
                       ('Feature Transform', TfidfTransformer(norm=norm)),
                       ('Classifier', MultinomialNB(alpha=alpha))])
pipeline.fit(train_X, train_y)

KeyboardInterrupt: 

In [None]:
# evaluate
eval_y_pred = pipeline.predict(eval_X)
    
# define the score we want to use to evaluate the classifier on
acc_eval = accuracy_score(eval_y,eval_y_pred)

## Pipeline 2

In [12]:
pipeline2=Pipeline([('tfidf', TfidfVectorizer(tokenizer=lambda string: string.split(), 
                                              min_df=min_df, 
                                              max_df=max_df, 
                                              norm=norm)),
                    ('Classifier', MultinomialNB(alpha=alpha))])
pipeline2.fit(train_X, train_y)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.75, max_features=None,
                                 min_df=20, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function <lambda> at 0x7f43e6a37400>,
                                 use_idf=True, vocabulary=None)),
                ('Classifier',
                 MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))],
         verbose=False)

## Pipeline 3

In [17]:
vectorizer = TfidfVectorizer()
dtm = vectorizer.fit_transform(train_X)

In [18]:
classifier = MultinomialNB(alpha=alpha)

In [19]:
classifier.fit(dtm, train_y)

MemoryError: Unable to allocate array with shape (7572, 1369169) and data type float64