# Text classification starter

## Import libraries

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
# import nltk
# nltk.download()


## Import data

In [2]:
# Load the predefined train and test splits of the 20 Newsgroups corpus.
# Later cells read .data (raw posts), .target (integer labels) and
# .target_names (newsgroup names) from these objects.
data_train = fetch_20newsgroups(subset='train', shuffle=True)
data_test = fetch_20newsgroups(subset='test', shuffle=True)

## Data exploration

In [3]:
# Quick look at the loaded object. Only the last expression of a cell is
# rendered, so the output below is target_names (the 20 newsgroup names);
# the type() and keys() results are evaluated but not displayed.
type(data_train)
data_train.keys()
data_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Distinct integer class labels present in the training targets
# (0-19 in the output below, one per entry of target_names).
set(data_train.target) 

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}

In [5]:
# Print the full raw text of the first training document (headers and body).
# Splitting on "\n" and re-joining with "\n" reproduces the string unchanged,
# so the document is printed directly.
print(data_train.data[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







## Data preprocessing

In [6]:
# Vectorization: learn the vocabulary from the training documents and turn
# each document into a row of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data_train.data)
X_train_counts.shape
# Shape is (n_samples, n_features): number of training documents x number of unique tokens.


(11314, 130107)

In [7]:
# Sanity check: number of training documents; should equal the row count of X_train_counts.
len(data_train.data)

11314

In [8]:
# Read the shape straight from the sparse matrix. Calling .toarray() first
# would materialise a dense 11314 x 130107 array (about 1.5 billion cells)
# only to throw it away after reading .shape.
X_train_counts.shape
# count_vect.get_feature_names()  # all the distinct tokens (get_feature_names_out() on newer scikit-learn)

(11314, 130107)

In [9]:
# TF: term frequencies.
# TF-IDF: term frequency times inverse document frequency.
# Re-weight the raw count matrix; the shape stays the same (same documents, same vocabulary).

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape


(11314, 130107)

## Algo1 - Naive Bayes

### Train

In [10]:
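# Text classifier: multinomial Naive Bayes.
# Fitting directly on the precomputed TF-IDF matrix would look like the line
# below; the following cells wrap the same steps in a Pipeline instead.
# clf = MultinomialNB().fit(X_train_tfidf, data_train.target)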

In [11]:
# Naive Bayes pipeline: raw text -> token counts -> TF-IDF weights -> classifier.
# All three steps use their default settings. Options worth trying later:
#   - CountVectorizer(stop_words=...) to drop common words ("then", "the", ...)
#   - MultinomialNB(fit_prior=False) to use a uniform class prior
text_clf_nb = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

In [12]:
# Fit the whole pipeline on the raw training posts and their labels.
# The result of fit() is assigned back to the same name.
text_clf_nb = text_clf_nb.fit(data_train.data, data_train.target)

### Test

In [13]:
# Predict a newsgroup for every test post, then report accuracy as the
# fraction of predictions that match the true label.
predicted = text_clf_nb.predict(data_test.data)
(predicted == data_test.target).mean()

0.7738980350504514

In [14]:
# Same accuracy via the pipeline's score(); the value matches the manual computation above.
text_clf_nb.score(data_test.data, data_test.target)

0.7738980350504514

## Algo2 - Support Vector Machines

### Train

In [15]:
# Linear SVM pipeline: same vectorizer and TF-IDF steps as the Naive Bayes
# pipeline, with an SGD-trained classifier using hinge loss and an L2 penalty.
# The final step is named 'clf-svm', so grid-search keys for it must use the
# 'clf-svm__' prefix.
# (A commented-out n_iter=5 argument was dropped; iteration settings are the
# estimator defaults.)
text_clf_svm = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        (
            "clf-svm",
            SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3, random_state=42),
        ),
    ]
)

In [16]:
# Fit the SVM pipeline on the raw training posts and their labels.
# The result of fit() is assigned back to the same name.
text_clf_svm = text_clf_svm.fit(data_train.data, data_train.target)

### Test

In [17]:
# Predict a newsgroup for every test post with the SVM pipeline, then report
# accuracy as the fraction of predictions that match the true label.
predicted_svm = text_clf_svm.predict(data_test.data)
(predicted_svm == data_test.target).mean()

0.8240839086563994

In [18]:
# Same accuracy via the pipeline's score(); the value matches the manual computation above.
text_clf_svm.score(data_test.data, data_test.target)

0.8240839086563994

## Grid Search

In [19]:
# Hyper-parameter grid for the Naive Bayes pipeline (text_clf_nb).
# Keys follow the '<pipeline step name>__<parameter>' convention.
# Note: the SVM grid further down reuses the name `parameters`, so re-run this
# cell before re-running the Naive Bayes search.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with and without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3),             # alpha of the MultinomialNB step
}

In [20]:
# Exhaustive cross-validated search over `parameters` for the Naive Bayes
# pipeline, using all available cores (n_jobs=-1). Fitted on the training split only.
gs_clf_nb = GridSearchCV(
    estimator=text_clf_nb,
    param_grid=parameters,
    n_jobs=-1,
).fit(data_train.data, data_train.target)

In [21]:
# Best mean cross-validated score found by the search. The search was fitted
# on the training split only, so this is not directly comparable to the
# test-set accuracy reported earlier.
gs_clf_nb.best_score_

0.9157684864695698

In [22]:
# Parameter combination that achieved the best cross-validated score.
# (best_score_ is already shown by the previous cell; evaluating it again here
# had no visible effect because only the last expression is rendered.)
gs_clf_nb.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [23]:
# Hyper-parameter grid for the SVM pipeline (text_clf_svm).
# Keys follow the '<pipeline step name>__<parameter>' convention; the
# classifier step of that pipeline is named 'clf-svm'.
# Note: this rebinds the name `parameters` used earlier for the Naive Bayes grid.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with and without IDF re-weighting
    'clf-svm__alpha': (1e-2, 1e-3),         # alpha of the SGDClassifier step
}

In [24]:
# Exhaustive cross-validated search over `parameters` for the SVM pipeline,
# using all available cores (n_jobs=-1). Fitted on the training split only.
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters,
    n_jobs=-1,
).fit(data_train.data, data_train.target)

In [25]:
# Best mean cross-validated score found by the SVM search. The search was
# fitted on the training split only, so this is not a test-set accuracy.
gs_clf_svm.best_score_

0.9051618841994754

In [26]:
# Parameter combination that achieved the best cross-validated score for the SVM pipeline.
gs_clf_svm.best_params_

{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}