In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
data = pd.read_csv('english-train.csv')

In [3]:
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=stopwords.words('english')):
    
    ## clean (convert to lowercase and remove punctuations and   
    # characters and then strip)
    text = re.sub('[0-9]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r"\b[a-zA-Z]\b", ' ', text)
    
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [4]:
data['headline'] = data.headline.apply(lambda x: 
          preprocess_text(x))

In [5]:
def count_length():
    data['word_count'] = data['headline'].apply(lambda x: len(str(x).split(" ")))

In [6]:
count_length()

In [7]:
data.head(10)

Unnamed: 0,headline,Label,word_count
0,former new zealand cricketer chris cairn diagn...,Sports,9
1,american skater nathan chen dazzle olympic return,Sports,7
2,la liga ene unal score brace lead getafes win ...,Sports,10
3,world cup australia beat afghanistan claim rd ...,Sports,8
4,icc world cup india v england final live strea...,Sports,10
5,brainsqueeze know snow,Sports,3
6,justin langer step australia coach,Sports,5
7,ratnakar shettys memoir apparently virat unhap...,Sports,9
8,think odi revolution happened world cup tendul...,Sports,10
9,world cup boxer son nishant sindhu deadly left,Sports,8


In [8]:
data.shape

(37395, 3)

In [9]:
data.replace('[^a-zA-Z]',' ',inplace=True)
for index in ['Label']:
    data[index]=data[index].str.lower()

In [10]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['headline'],data['Label'],test_size=0.3,random_state=42)

In [11]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)
print(Train_Y[0:10])
print(Train_X[0:10])

[4 4 1 3 4 3 0 4 0 4]
10572    microsoft say israeli group sold tool hack window
10342    facebook twitter big tech see social commerce ...
37201           loki episode notice chris hemsworths cameo
7102     india paraathletics team tokyo paralympics pic...
9926     bird prey face global decline habitat loss poison
4788     real madrid join premier league la liga club r...
21720    nbfcs face liquidity pressure lack clarity rbi...
9805     apple app store change fail sway lawmaker bent...
21237    zomato asks staff start looking job firm fores...
7286               khaalijeb cofounder devastated reinvent
Name: headline, dtype: object


In [12]:
Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(data['headline'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)



In [13]:
dt = RandomForestClassifier(max_depth = 1000)
dt.fit(Train_X_Tfidf,Train_Y)
predictions_dec = dt.predict(Test_X_Tfidf)

In [14]:
print("RFC Accuracy Score -> ",accuracy_score(predictions_dec, Test_Y)*100)

RFC Accuracy Score ->  89.33951332560835


In [15]:
mat = confusion_matrix(predictions_dec, Test_Y)
mat

array([[2056,   25,   72,   46,  121],
       [  20, 1936,   45,   80,   56],
       [ 103,   49, 2192,   82,   59],
       [  48,   85,   32, 1964,   56],
       [ 101,   56,   25,   35, 1875]], dtype=int64)

In [16]:
report = classification_report(predictions_dec, Test_Y)
print(report)

              precision    recall  f1-score   support

           0       0.88      0.89      0.88      2320
           1       0.90      0.91      0.90      2137
           2       0.93      0.88      0.90      2485
           3       0.89      0.90      0.89      2185
           4       0.87      0.90      0.88      2092

    accuracy                           0.89     11219
   macro avg       0.89      0.89      0.89     11219
weighted avg       0.89      0.89      0.89     11219



In [27]:
import joblib
import os
loaded_model = joblib.dump(RandomForestClassifier, 'eng_rfc.pkl')

In [17]:
simple_test = ["T20 World Cup: Team India 'unwinds' day off with beach volleyball"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test)
pred1 = dt.predict(Train_X_Tfidf)
pred1[0]



3

In [18]:
predictions_test = Encoder.inverse_transform(pred1)
predictions_test[0]

'sports'

In [19]:
simple_test2 = ["Five IPOs to hit mkt in first half of Nov; seek to raise over Rs 27,000 cr"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test2)
pred2 = dt.predict(Train_X_Tfidf)
pred2[0]

0

In [20]:
predictions_test = Encoder.inverse_transform(pred2)
predictions_test[0]

'business'

In [21]:
simple_test3 = ["Anushka Sharma dresses daughter Vamika as a fairy for Halloween, Soha Ali Khan turns Inaaya into a unicorn"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test3)
pred3 = dt.predict(Train_X_Tfidf)
pred3[0]

1

In [22]:
predictions_test = Encoder.inverse_transform(pred3)
predictions_test[0]

'entertainment'

In [23]:
simple_test4 = ["All iOS users can now Super Follow on Twitter"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test4)
pred4 = dt.predict(Train_X_Tfidf)
pred4[0]

4

In [24]:
predictions_test = Encoder.inverse_transform(pred4)
predictions_test[0]

'tech'

In [32]:
simple_test5 = ["Partap Bajwa Says Congress Had Decided Only Sitting MLA will be Punjab CM"]
Train_X_Tfidf = Tfidf_vect.transform(simple_test5)
pred5 = dt.predict(Train_X_Tfidf)
pred5[0]

2

In [33]:
predictions_test = Encoder.inverse_transform(pred5)
predictions_test[0]

'politics'