## Project:- Microsoft News Classification using NLTK

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df=pd.read_csv('news.tsv',header=None,sep='\t')

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [4]:
import copy

In [5]:
df.columns=['News ID',
"Category",
"SubCategory",
"Title",
"Abstract",
"URL",
"Title Entities",
"Abstract Entities "]
# Copying the dataset
df=copy.deepcopy(df)

In [6]:
df.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,URL,Title Entities,Abstract Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   News ID             51282 non-null  object
 1   Category            51282 non-null  object
 2   SubCategory         51282 non-null  object
 3   Title               51282 non-null  object
 4   Abstract            48616 non-null  object
 5   URL                 51282 non-null  object
 6   Title Entities      51279 non-null  object
 7   Abstract Entities   51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


### Any missing values?

In [8]:
df.isnull().sum()

News ID                  0
Category                 0
SubCategory              0
Title                    0
Abstract              2666
URL                      0
Title Entities           3
Abstract Entities        4
dtype: int64

In [9]:
df.isnull().sum()/len(df)*100

News ID               0.000000
Category              0.000000
SubCategory           0.000000
Title                 0.000000
Abstract              5.198705
URL                   0.000000
Title Entities        0.005850
Abstract Entities     0.007800
dtype: float64

In [10]:
df.dropna(inplace=True)

In [11]:
df['Category'].value_counts()

news             15203
sports           13230
finance           3047
foodanddrink      2513
lifestyle         2318
travel            2223
video             2064
weather           1879
health            1834
autos             1502
tv                 868
music              754
movies             602
entertainment      556
kids                16
middleeast           2
northamerica         1
Name: Category, dtype: int64

In [12]:
#df['categorty']

In [13]:
df.head()['Title'][1]

'50 Worst Habits For Belly Fat'

### Text Preprocessing

In [14]:
import string


In [15]:
from nltk.corpus import stopwords

In [16]:
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [221]:
new=df['Title'][:5]
new

0    The Brands Queen Elizabeth, Prince Charles, an...
1                        50 Worst Habits For Belly Fat
2    The Cost of Trump's Aid Freeze in the Trenches...
3    I Was An NBA Wife. Here's How It Affected My M...
4    How to Get Rid of Skin Tags, According to a De...
Name: Title, dtype: object

In [222]:
def text_process(mess):
    nopunc=[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
     

In [223]:
df['Title'].head(5).apply(text_process)

0    [Brands, Queen, Elizabeth, Prince, Charles, Pr...
1                      [50, Worst, Habits, Belly, Fat]
2    [Cost, Trumps, Aid, Freeze, Trenches, Ukraines...
3         [NBA, Wife, Heres, Affected, Mental, Health]
4     [Get, Rid, Skin, Tags, According, Dermatologist]
Name: Title, dtype: object

In [224]:
len(df['Title'].head(5).apply(text_process)[0])

8

In [225]:
df['Title'][2]

"The Cost of Trump's Aid Freeze in the Trenches of Ukraine's War"

In [193]:
#df['Title']=df['Title'][:2000].apply(text_process)

In [123]:
print(len(df['Title'][0]))

70


### Vectorization

In [226]:
from sklearn.feature_extraction.text import CountVectorizer

In [227]:
bow_transformer=CountVectorizer(analyzer=text_process).fit(df['Title'])

In [228]:
print(len(bow_transformer.vocabulary_))

45710


In [229]:
text4=df['Title'][3]
text4

"I Was An NBA Wife. Here's How It Affected My Mental Health."

In [231]:
bow4=bow_transformer.transform([text4])
print(bow4)

  (0, 2409)	1
  (0, 12808)	1
  (0, 12936)	1
  (0, 17276)	1
  (0, 18136)	1
  (0, 28298)	1


now let's see its vector representation

In [232]:
print(bow4.shape)

(1, 45710)


In [233]:
print(bow_transformer.get_feature_names()[28298])

Wife


In [234]:
#now apply forwhole dataframe

text_bow=bow_transformer.transform(df['Title'])
print(text_bow.shape)


(48612, 45710)


### now TF-IDF

In [238]:
from sklearn.feature_extraction.text import TfidfTransformer

In [239]:
tfidf_transformer=TfidfTransformer().fit(text_bow)
tfidf4=tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 28298)	0.41312047715482697
  (0, 18136)	0.34908149053060955
  (0, 17276)	0.471287705245047
  (0, 12936)	0.2920974645887958
  (0, 12808)	0.36825936608419807
  (0, 2409)	0.5142232916556925


In [240]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['wife']])

7.460912923545194


In [241]:
#now apply to the whole datafram
text_tfidf=tfidf_transformer.transform(text_bow)
print(text_tfidf.shape)

(48612, 45710)


In [242]:
df['Category'].value_counts()

news             15203
sports           13230
finance           3047
foodanddrink      2513
lifestyle         2318
travel            2223
video             2064
weather           1879
health            1834
autos             1502
tv                 868
music              754
movies             602
entertainment      556
kids                16
middleeast           2
northamerica         1
Name: Category, dtype: int64

### Model Training

In [243]:
from sklearn.linear_model import LogisticRegression

In [244]:

model=LogisticRegression()
model.fit(text_tfidf,df['Category'])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [245]:
all_predictions=model.predict(text_tfidf)

In [246]:
print(classification_report(all_predictions,df['Category']))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

        autos       0.61      0.95      0.74       957
entertainment       0.40      0.95      0.56       233
      finance       0.67      0.89      0.76      2285
 foodanddrink       0.82      0.90      0.86      2288
       health       0.69      0.93      0.79      1363
         kids       0.00      0.00      0.00         0
    lifestyle       0.60      0.84      0.70      1652
   middleeast       0.00      0.00      0.00         0
       movies       0.45      0.95      0.61       283
        music       0.53      0.94      0.68       426
         news       0.97      0.70      0.81     21041
 northamerica       0.00      0.00      0.00         0
       sports       0.98      0.90      0.94     14285
       travel       0.52      0.86      0.65      1337
           tv       0.50      0.90      0.64       482
        video       0.23      0.92      0.36       508
      weather       0.68      0.87      0.77      1472

     acc

  _warn_prf(average, modifier, msg_start, len(result))


### Train test split

In [None]:
X=

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)

#### creating Pipeline

In [249]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [253]:
pipeline = Pipeline([
    ('Tf-Idf', CountVectorizer(analyzer=text_process)),
    ('classifier', LogisticRegression(solver='newton-cg', multi_class='multinomial'))
])

In [254]:
X1 = df['Title']
y1 = df['Category']

In [259]:
review_train1, review_test1, label_train1, label_test1 = train_test_split(X1, y1, test_size=0.3, random_state=101)

In [260]:
pipeline.fit(review_train1, label_train1)


Pipeline(steps=[('Tf-Idf',
                 CountVectorizer(analyzer=<function text_process at 0x000001A8E2CFD160>)),
                ('classifier',
                 LogisticRegression(multi_class='multinomial',
                                    solver='newton-cg'))])

In [262]:
from sklearn import metrics

In [263]:
pip_pred1 = pipeline.predict(review_test1)
print(metrics.classification_report(label_test1,pip_pred1,zero_division=1))

               precision    recall  f1-score   support

        autos       0.73      0.49      0.59       459
entertainment       0.67      0.32      0.44       173
      finance       0.61      0.51      0.55       907
 foodanddrink       0.75      0.69      0.72       757
       health       0.73      0.56      0.63       578
         kids       1.00      0.00      0.00         6
    lifestyle       0.56      0.45      0.50       695
   middleeast       1.00      0.00      0.00         2
       movies       0.78      0.45      0.57       172
        music       0.81      0.46      0.59       230
         news       0.64      0.84      0.73      4515
       sports       0.87      0.93      0.90      3971
       travel       0.50      0.34      0.40       692
           tv       0.65      0.33      0.44       247
        video       0.31      0.14      0.20       601
      weather       0.73      0.59      0.65       579

     accuracy                           0.71     14584
    macr

In [264]:
pipeline.score(review_train1,label_train1)

0.968790407899377