# Classification

## Training Model

In [69]:
import pandas as pd
import numpy as np
from gensim.corpora import Dictionary
from gensim.utils import tokenize

In [70]:
data = pd.read_csv("data/ArticlesApril2017.csv")

In [71]:
df = data[["articleID", "newDesk", "headline", "keywords", "snippet"]]

In [72]:
df

Unnamed: 0,articleID,newDesk,headline,keywords,snippet
0,58def1347c459f24986d7c80,Insider,Finding an Expansive View of a Forgotten Peop...,"['Photography', 'New York Times', 'Niger', 'Fe...",One of the largest photo displays in Times his...
1,58def3237c459f24986d7c84,OpEd,"And Now, the Dreaded Trump Curse","['United States Politics and Government', 'Tru...",Meet the gang from under the bus.
2,58def9f57c459f24986d7c90,Editorial,Venezuela’s Descent Into Dictatorship,"['Venezuela', 'Politics and Government', 'Madu...",A court ruling annulling the legislature’s aut...
3,58defd317c459f24986d7c95,Sports,Stain Permeates Basketball Blue Blood,"['Basketball (College)', 'University of North ...","For two decades, until 2013, North Carolina en..."
4,58df09b77c459f24986d7ca7,Games,Taking Things for Granted,['Crossword Puzzles'],In which Howard Barkin and Will Shortz teach u...
...,...,...,...,...,...
881,58fd41ab7c459f24986dbaa7,Insider,Reporting on Gays Who ‘Don’t Exist’,"['Chechnya (Russia)', 'Homosexuality and Bisex...","“I see flies, I see mosquitoes,” said a Cheche..."
882,58fd45a17c459f24986dbaaa,National,The Fights That Could Lead to a Government Shu...,"['Trump, Donald J', 'United States Politics an...",The Trump administration wants to use the dead...
883,58fd5c2c7c459f24986dbac3,Culture,"‘The Leftovers’ Season 3, Episode 2: Swedish P...","['Television', 'The Leftovers (TV Program)']","For all its melancholy, “The Leftovers” rarely..."
884,58fd5c3d7c459f24986dbac4,Culture,"Thinking Out Loud, But Why?","['Theater', 'The Antipodes (Play)', 'Baker, An...","In this endlessly fascinating work, Annie Bake..."


In [82]:
tokens = [list(tokenize(row)) for row in df["snippet"].values]

In [90]:
d = Dictionary(tokens)

In [98]:
corpus = [d.doc2bow(row) for row in tokens]

In [103]:
from gensim.models import TfidfModel

In [104]:
tf = TfidfModel(corpus)

In [105]:
tf[corpus[0]]

[(0, 0.30390195405303666),
 (1, 0.30390195405303666),
 (2, 0.2728635618301608),
 (3, 0.15631789078523653),
 (4, 0.10191047150462035),
 (5, 0.11239740187254238),
 (6, 0.30390195405303666),
 (7, 0.2547072662971033),
 (8, 0.30390195405303666),
 (9, 0.11837679536152494),
 (10, 0.200794646907732),
 (11, 0.2728635618301608),
 (12, 0.05811317093519547),
 (13, 0.17703367918987814),
 (14, 0.2728635618301608),
 (15, 0.07966737411773732),
 (16, 0.15631789078523653),
 (17, 0.2547072662971033),
 (18, 0.30390195405303666),
 (19, 0.022924796267902687),
 (20, 0.13149644663785265)]

In [108]:
M = len(tokens)
N = len(d)

In [113]:
X = np.zeros((M, N))
for i, doc in enumerate(corpus):
    for idx, val in tf[doc]:
        X[(i, idx)] = val

In [121]:
id_by_label = dict([(v, i) for i, v in enumerate(set(df["newDesk"].values))])
label_by_id = dict([(v, k) for k, v in id_by_label.items()])

In [124]:
y = [id_by_label[label] for label in df["newDesk"].values]

In [126]:
from sklearn.naive_bayes import GaussianNB

In [146]:
model = GaussianNB()

In [171]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [147]:
model.fit(X, y)

GaussianNB()

## Testing model

In [160]:
text = "The Trump is calling for war"

test_corpus = d.doc2bow(list(tokenize(text)))

Xs = np.zeros((1, N))
for idx, val in tf[test_corpus]:
    Xs[(0, idx)] = val

In [161]:
model.predict(Xs)

array([6])

In [163]:
label_by_id[6]

'Learning'