In [1]:
import numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("english-train.csv")

In [3]:
data.head(10)

Unnamed: 0,headline,Label
0,Former New Zealand cricketer Chris Cairns diag...,Sports
1,American skater Nathan Chen dazzles in his Oly...,Sports
2,La Liga: Enes Unal scores brace to lead Getafe...,Sports
3,U-19 World Cup: Australia beat Afghanistan to ...,Sports
4,ICC U-19 World Cup India vs England final Live...,Sports
5,Brainsqueeze: Know your Snow,Sports
6,Justin Langer steps down as Australia coach,Sports
7,From Ratnakar Shetty’s memoirs: Apparently Vir...,Sports
8,I think ODI revolution happened with 1996 Worl...,Sports
9,U-19 World Cup: Boxer’s son Nishant Sindhu wit...,Sports


In [4]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stopwords.words('english')):
    
    ## clean (convert to lowercase and remove punctuations and   
    # characters and then strip)
    text = re.sub('[0-9]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r"\b[a-zA-Z]\b", ' ', text)
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [5]:
data['headline'] = data.headline.apply(lambda x: 
          preprocess_text(x))

In [6]:
def count_length():
    data['word_count'] = data['headline'].apply(lambda x: len(str(x).split(" ")))

In [7]:
count_length()

In [8]:
data.head(10)

Unnamed: 0,headline,Label,word_count
0,former new zealand cricketer chris cairn diagn...,Sports,9
1,american skater nathan chen dazzle olympic return,Sports,7
2,la liga ene unal score brace lead getafes win ...,Sports,10
3,world cup australia beat afghanistan claim rd ...,Sports,8
4,icc world cup india v england final live strea...,Sports,10
5,brainsqueeze know snow,Sports,3
6,justin langer step australia coach,Sports,5
7,ratnakar shettys memoir apparently virat unhap...,Sports,9
8,think odi revolution happened world cup tendul...,Sports,10
9,world cup boxer son nishant sindhu deadly left,Sports,8


In [9]:
train, test = train_test_split(data,test_size=0.15,random_state=42)

In [10]:
train.shape

(31785, 3)

In [11]:
test.shape

(5610, 3)

In [49]:
model = make_pipeline(TfidfVectorizer(lowercase=False), MultinomialNB(alpha=0.1))
model.fit(train.headline, train.Label)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(lowercase=False)),
                ('multinomialnb', MultinomialNB(alpha=0.1))])

In [50]:
predicted_categories = model.predict(test.headline)

In [51]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print("The accuracy is {}".format(accuracy_score(test.Label, predicted_categories)*100))

The accuracy is 95.0445632798574


In [52]:
import joblib
import os
loaded_model = joblib.dump(model, 'eng_nb.pkl')

In [53]:
mat = confusion_matrix(predicted_categories, test.Label)
mat

array([[1125,    5,   27,   14,   50],
       [   4, 1063,   12,   25,   18],
       [  17,    6, 1126,   12,    7],
       [   9,   10,    8, 1041,   10],
       [  26,   12,    1,    5,  977]], dtype=int64)

In [54]:
report = classification_report(test.Label, predicted_categories)
print(report)

               precision    recall  f1-score   support

     Business       0.92      0.95      0.94      1181
Entertainment       0.95      0.97      0.96      1096
     Politics       0.96      0.96      0.96      1174
       Sports       0.97      0.95      0.96      1097
         Tech       0.96      0.92      0.94      1062

     accuracy                           0.95      5610
    macro avg       0.95      0.95      0.95      5610
 weighted avg       0.95      0.95      0.95      5610



In [55]:
simple_test = ["Thousands of fans try to force their way into Dubai stadium, report sought"]
lst = [x.lower() for x in simple_test]
pred1 = model.predict(lst)
pred1[0]

'Sports'

In [56]:
simple_test2 = ["Five IPOs to hit mkt in first half of Nov; seek to raise over Rs 27,000 cr"]
lst2 = [x.lower() for x in simple_test2]
pred2 = model.predict(lst2)
pred2[0]

'Business'

In [57]:
simple_test3 = ["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]
lst3 = [x.lower() for x in simple_test3]
pred3 = model.predict(lst3)
pred3[0]

'Entertainment'

In [58]:
simple_test4 = ["All iOS users can now Super Follow on Twitter"]
lst4 = [x.lower() for x in simple_test4]
pred4 = model.predict(lst4)
pred4[0]

'Tech'

In [59]:
simple_test5 = ["In Politics, Anything is Possible’: Partap Bajwa Says Congress Had Decided Only Sitting MLA will be Punjab CM"]
lst5 = [x.lower() for x in simple_test5]
pred5 = model.predict(lst5)
pred5[0]

'Politics'