In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Text Classification

This is the data from News Articales and their topics,
our task is to make a text classifier to detect the topic of an news articale.

In [3]:
df = pd.read_csv('News_Articales.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0,Text,Target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\r\nSubje...,talk.politics.mideast
3,From: guyd@austin.ibm.com (Guy Dawson)\r\nSubj...,comp.sys.ibm.pc.hardware
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,comp.sys.mac.hardware


In [5]:
print(df.sample()['Text'].iloc[0])

From: tsmith@cs.stanford.edu (Todd Michael Smith)
Subject: God-shaped hole (was Re: "Accepting Jeesus in your heart...")
Organization: Computer Science Department, Stanford University.
Lines: 16

In article <Apr.14.03.07.38.1993.5420@athos.rutgers.edu>, johnsd2@rpi.edu (Dan Johnson) writes:

|> >Those who have an empty spot in the God-shaped hole in their hearts must 
|> >do something to ease the pain.
|> 
|> I have heard this claim quite a few times. Does anybody here know
|> who first came up with the "God-shaped hole" business?
|> 

Was it Pascal, or maybe Descartes, who first used this figure of speech? 
I seem to have some vague recollections from reading some of their essays,
but I certainly couldn't say it was one of them for sure.

----
Todd Smith
tsmith@cs.stanford.edu



## Data Cleaning

You may need to clean data for a bit

You can also not clean the data, your choice!

Before deciding how to clean the data, check parameters of `TF-IDF` and see what options it provides

In [14]:
print(df['Text'][2])

From: hilmi-er@dsv.su.se (Hilmi Eren)
Subject: Re: ARMENIA SAYS IT COULD SHOOT DOWN TURKISH PLANES (Henrik)
Lines: 95
Nntp-Posting-Host: viktoria.dsv.su.se
Reply-To: hilmi-er@dsv.su.se (Hilmi Eren)
Organization: Dept. of Computer and Systems Sciences, Stockholm University




|>The student of "regional killings" alias Davidian (not the Davidian religios sect) writes:


|>Greater Armenia would stretch from Karabakh, to the Black Sea, to the
|>Mediterranean, so if you use the term "Greater Armenia" use it with care.


	Finally you said what you dream about. Mediterranean???? That was new....
	The area will be "greater" after some years, like your "holocaust" numbers......




|>It has always been up to the Azeris to end their announced winning of Karabakh 
|>by removing the Armenians! When the president of Azerbaijan, Elchibey, came to 
|>power last year, he announced he would be be "swimming in Lake Sevan [in 
|>Armeniaxn] by July".
		*****
	Is't July in USA 

## TF-IDF

Make a pipeline using `TF-IDF` and `LogisticRegression`.

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
encoder = LabelEncoder()
y = df['Target']
encoder.fit_transform(y)

array([10,  3, 17, ...,  3,  1,  7], dtype=int64)

In [8]:
X = df['Text']

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
steps = [('tfidf', TfidfVectorizer(lowercase=True, stop_words='english', max_features=500)),
        ('clf', LogisticRegression())]

tf_lr = Pipeline(steps)
tf_lr.fit(X_train, y_train)



train_predictions = tf_lr.predict(X_train)
test_predictions = tf_lr.predict(X_test)

In [15]:
print("Train:")
print(classification_report(train_predictions, y_train))

Train:
                          precision    recall  f1-score   support

             alt.atheism       0.72      0.71      0.72       702
           comp.graphics       0.67      0.67      0.67       822
 comp.os.ms-windows.misc       0.72      0.74      0.73       801
comp.sys.ibm.pc.hardware       0.63      0.66      0.64       813
   comp.sys.mac.hardware       0.68      0.72      0.70       760
          comp.windows.x       0.73      0.69      0.71       870
            misc.forsale       0.81      0.78      0.79       857
               rec.autos       0.71      0.73      0.72       823
         rec.motorcycles       0.76      0.74      0.75       887
      rec.sport.baseball       0.73      0.71      0.72       862
        rec.sport.hockey       0.83      0.80      0.82       878
               sci.crypt       0.87      0.90      0.89       809
         sci.electronics       0.58      0.60      0.59       810
                 sci.med       0.72      0.58      0.65      1044
  

In [16]:
print("Test:")
print(classification_report(test_predictions, y_test))

Test:
                          precision    recall  f1-score   support

             alt.atheism       0.59      0.56      0.57       115
           comp.graphics       0.53      0.57      0.55       138
 comp.os.ms-windows.misc       0.73      0.71      0.72       161
comp.sys.ibm.pc.hardware       0.54      0.48      0.51       153
   comp.sys.mac.hardware       0.57      0.70      0.63       132
          comp.windows.x       0.62      0.67      0.65       157
            misc.forsale       0.66      0.67      0.67       147
               rec.autos       0.66      0.65      0.65       151
         rec.motorcycles       0.71      0.69      0.70       131
      rec.sport.baseball       0.66      0.64      0.65       163
        rec.sport.hockey       0.74      0.69      0.71       155
               sci.crypt       0.81      0.88      0.84       140
         sci.electronics       0.50      0.55      0.53       137
                 sci.med       0.61      0.51      0.55       181
   

## GridSearch

Now grid search through hyper paramters of these 2 models and save the best estimator.

In [17]:
params = {"tfidf__stop_words":[None, 'english'],
          "tfidf__max_df":[1.0, .95, .6],
          "tfidf__max_features":[None, 50, 100],
          "tfidf__use_idf":[True, False],
          "tfidf__sublinear_tf":[True, False],
          "clf__penalty": ['l2', 'l1'],
          "clf__C": [0.1, 1.0, 10.0]}

gs_pipe = GridSearchCV(tf_lr, params, scoring='f1_macro',
                      cv=2, verbose=1)

gs_pipe.fit(X_train, y_train)

Fitting 2 folds for each of 432 candidates, totalling 864 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tfidf__stop_words': [None, 'english'], 'tfidf__max_df': [1.0, 0.95, 0.6], 'tfidf__max_features': [None, 50, 100], 'tfidf__use_idf': [True, False], 'tfidf__sublinear_tf': [True, False], 'clf__penalty': ['l2', 'l1'], 'clf__C': [0.1, 1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_macro', verbose=1)

In [20]:
print("Test:")
print(classification_report(gs_pipe.predict(X_test), y_test))

Test:
                          precision    recall  f1-score   support

             alt.atheism       0.94      0.92      0.93       111
           comp.graphics       0.90      0.88      0.89       152
 comp.os.ms-windows.misc       0.87      0.87      0.87       156
comp.sys.ibm.pc.hardware       0.82      0.75      0.78       148
   comp.sys.mac.hardware       0.90      0.90      0.90       160
          comp.windows.x       0.89      0.94      0.91       160
            misc.forsale       0.85      0.88      0.86       144
               rec.autos       0.97      0.91      0.94       158
         rec.motorcycles       0.93      0.99      0.96       119
      rec.sport.baseball       0.99      0.98      0.99       162
        rec.sport.hockey       0.98      0.98      0.98       145
               sci.crypt       0.96      0.99      0.98       147
         sci.electronics       0.90      0.88      0.89       155
                 sci.med       0.95      0.96      0.96       150
   

In [21]:
gs_pipe.best_params_

{'clf__C': 10.0,
 'clf__penalty': 'l2',
 'tfidf__max_df': 1.0,
 'tfidf__max_features': None,
 'tfidf__stop_words': 'english',
 'tfidf__sublinear_tf': True,
 'tfidf__use_idf': True}

In [25]:
from sklearn.externals import joblib

In [26]:
article_detector = gs_pipe.best_estimator_

In [27]:
joblib.dump(article_detector, "News_class.pkl")

['News_class.pkl']

# Congradulations! You have done it!