# Text Classification

Improving text classification using LSI/NMF

In [None]:
import pandas as pd
from cytoolz import identity
import spacy

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
from sklearn.linear_model import *
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
from sklearn.model_selection import *

In [None]:
# Load the spaCy English pipeline; later cells use it both for tokenization
# (`nlp.tokenizer`) and for document vectors (`nlp(text).vector`).
# NOTE(review): 'en' is a shortcut model name -- confirm it resolves to a model
# that ships word vectors in the spaCy version this notebook runs on.
nlp = spacy.load('en')

## Load data

In [None]:
# Read the review dataset; later cells use its `review_text` and
# `wine_variant` columns.
df = pd.read_csv("../input/phraserdata/phraser_data.csv")

In [None]:
def tokenize(text):
    """Return the lower-cased alphabetic, non-stop-word tokens of `text`.

    Only the spaCy tokenizer is run (not the full pipeline), so this is
    cheap enough to apply to every review.
    """
    kept = []
    for token in nlp.tokenizer(text):
        # Skip punctuation/numbers (non-alphabetic) and stop words.
        if not token.is_alpha or token.is_stop:
            continue
        kept.append(token.lower_)
    return kept


# Pre-tokenize every review once; the vectorizers below consume these lists.
df['tokens'] = df['review_text'].apply(tokenize)

## Latent Semantic Indexing

In [None]:
# Two-component LSI: token counts -> TF-IDF weighting -> truncated SVD.
# `analyzer=identity` feeds the pre-tokenized lists through unchanged, and
# `max_df=0.5` ignores tokens that appear in more than half of the documents.
lsi = make_pipeline(
    CountVectorizer(analyzer=identity, max_df=0.5),
    TfidfTransformer(norm='l2', use_idf=True),
    TruncatedSVD(n_components=2),
)

In [None]:
# Fit the LSI pipeline on the token lists and project every review into the
# reduced space in one step.
M = lsi.fit_transform(df['tokens'])

In [None]:
# Sanity check: one row per review, one column per SVD component.
M.shape

In [None]:
# Plot the first LSI component against the second; small (s=2), nearly
# transparent (alpha=0.05) markers are used so overlapping points build up density.
plt.scatter(M[:,0],M[:,1], s=2, alpha=0.05)

In [None]:
# Same LSI pipeline as before, but keeping 300 SVD components instead of 2
# so the individual components can be inspected as topics.
lsi = make_pipeline(
    CountVectorizer(analyzer=identity, max_df=0.5),
    TfidfTransformer(norm='l2', use_idf=True),
    TruncatedSVD(n_components=300),
)

In [None]:
# Fit only (no transform): the following cells inspect the learned components.
lsi.fit(df['tokens'])

In [None]:
# Shape of the learned term-weight matrix: one row per SVD component,
# one column per vocabulary term.
lsi.named_steps['truncatedsvd'].components_.shape

In [None]:
# Print the ten highest-weighted vocabulary terms for each of the first ten
# LSI components.
vectorizer = lsi.named_steps['countvectorizer']
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    V = vectorizer.get_feature_names_out()
else:
    V = vectorizer.get_feature_names()
components = lsi.named_steps['truncatedsvd'].components_
# Slicing (rather than range(10)) also copes with fewer than ten components.
for d, weights in enumerate(components[:10]):
    # argsort is ascending, so reverse it to rank terms by decreasing weight.
    top = weights.argsort()[::-1][:10]
    print(d, ':', end=' ')
    for i in top:
        print(V[i], end=' ')
    print()

## Non-negative matrix factorization

In [None]:
# Two-component NMF on the same features as the LSI pipeline:
# token counts -> TF-IDF weighting -> non-negative matrix factorization.
nmf = make_pipeline(
    CountVectorizer(analyzer=identity, max_df=0.5),
    TfidfTransformer(norm='l2', use_idf=True),
    NMF(n_components=2),
)

In [None]:
# Fit the NMF pipeline and project every review onto its components
# (this overwrites the LSI projection previously stored in `M`).
M = nmf.fit_transform(df['tokens'])

In [None]:
# Plot the first NMF component against the second, using the same marker
# settings as the LSI scatter plot for comparison.
plt.scatter(M[:,0],M[:,1], s=2, alpha=0.05)

In [None]:
# Refit NMF with 50 components so the individual components can be
# inspected as topics in the next cell.
nmf = make_pipeline(
    CountVectorizer(analyzer=identity, max_df=0.5),
    TfidfTransformer(norm='l2', use_idf=True),
    NMF(n_components=50),
)
nmf.fit(df['tokens'])

In [None]:
# Print the ten highest-weighted vocabulary terms for each of the first ten
# NMF components.
vectorizer = nmf.named_steps['countvectorizer']
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    V = vectorizer.get_feature_names_out()
else:
    V = vectorizer.get_feature_names()
components = nmf.named_steps['nmf'].components_
# Slicing (rather than range(10)) also copes with fewer than ten components.
for d, weights in enumerate(components[:10]):
    # argsort is ascending, so reverse it to rank terms by decreasing weight.
    top = weights.argsort()[::-1][:10]
    print(d, ':', end=' ')
    for i in top:
        print(V[i], end=' ')
    print()

----

## GloVe

We'll try using GloVe vectors from Stanford NLP as features for classification.

In [None]:
# Run the full spaCy pipeline on the first review and display its
# document vector as a quick look at the feature representation.
doc = nlp(df['review_text'].iloc[0])
doc.vector

In [None]:
# Store one document vector per review. This runs the full spaCy pipeline
# on every row, so it is the slow step of the notebook.
df['vec'] = df['review_text'].apply(lambda t: nlp(t).vector)

In [None]:
# Hold out part of the data for evaluation: the cells below fit on `train`
# and score on `test`. The split happens after the `tokens` and `vec`
# columns exist so both frames carry them. A fixed random_state keeps the
# split, and therefore the reported scores, reproducible across runs.
train, test = train_test_split(df, test_size=0.25, random_state=0)

In [None]:
# Baseline classifier: raw token counts (no TF-IDF, no dimensionality
# reduction) fed straight into logistic regression.
bag_of_words = CountVectorizer(analyzer=identity)
baseline = make_pipeline(bag_of_words, LogisticRegression()).fit(
    train['tokens'], train['wine_variant']
)
# Mean accuracy on the held-out reviews.
baseline.score(test['tokens'], test['wine_variant'])

In [None]:
# Logistic regression on the per-review document vectors. The `vec` column
# holds one array per row, so it is unpacked into a list of arrays that
# scikit-learn can stack into a 2-D feature matrix.
model = LogisticRegression(C=10).fit(train['vec'].tolist(), train['wine_variant'])
# Mean accuracy on the held-out reviews, comparable with the baseline score.
model.score(test['vec'].tolist(), test['wine_variant'])