# Clustering News
### This dataset has been downloaded from https://www.kaggle.com/rmisra/news-category-dataset.
### Our goal is to categorize news articles based on their headlines and short descriptions

In [1]:
# Import of libs

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
#Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# K-Means
from sklearn.cluster import KMeans
#confusion Matrix
from sklearn.metrics import confusion_matrix
# Score
from sklearn.metrics import classification_report

In [3]:
#Load the dataset
news = pd.read_csv('/media/sf_FormacaoCientistaDeDados/Portfolio/News_Category/news-category-dataset/news_dataset.csv')

In [4]:
news.head()

Unnamed: 0.1,Unnamed: 0,authors,category,date,headline,link,short_description,text
0,11,,WORLD NEWS,2018-05-26,South Korean President Meets North Korea's Kim...,https://www.huffingtonpost.com/entry/south-kor...,The two met to pave the way for a summit betwe...,South Korean President Meets North Korea's Kim...
1,20,David Moye,WEIRD NEWS,2018-05-26,Weird Father's Day Gifts Your Dad Doesn't Know...,https://www.huffingtonpost.com/entry/weird-fat...,Why buy a boring tie when you can give him tes...,Weird Father's Day Gifts Your Dad Doesn't Know...
2,22,Hilary Hanson,WEIRD NEWS,2018-05-26,Mystery 'Wolf-Like' Animal Reportedly Shot In ...,https://www.huffingtonpost.com/entry/montana-w...,“We have no idea what this was until we get a ...,Mystery 'Wolf-Like' Animal Reportedly Shot In ...
3,23,"Josh Smith and Christine Kim, Reuters",WORLD NEWS,2018-05-25,North Korea Still Open To Talks After Trump Ca...,https://www.huffingtonpost.com/entry/north-kor...,Trump’s announcement came after repeated threa...,North Korea Still Open To Talks After Trump Ca...
4,24,,WORLD NEWS,2018-05-25,2 Men Detonate Bomb Inside Indian Restaurant N...,https://www.huffingtonpost.com/entry/mississau...,"Fifteen people were taken to the hospital, thr...",2 Men Detonate Bomb Inside Indian Restaurant N...


In [5]:
#Checking missing values, columns and rowcount
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21450 entries, 0 to 21449
Data columns (total 8 columns):
Unnamed: 0           21450 non-null int64
authors              18296 non-null object
category             21450 non-null object
date                 21450 non-null object
headline             21448 non-null object
link                 21450 non-null object
short_description    17926 non-null object
text                 21450 non-null object
dtypes: int64(1), object(7)
memory usage: 1.3+ MB


In [6]:
news.groupby(['category']).count()

Unnamed: 0_level_0,Unnamed: 0,authors,date,headline,link,short_description,text
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GREEN,2622,2063,2622,2622,2622,2046,2622
MEDIA,2815,2424,2815,2814,2815,2275,2815
RELIGION,2556,2167,2556,2555,2556,1858,2556
SCIENCE,2178,1706,2178,2178,2178,1775,2178
STYLE,2254,2126,2254,2254,2254,1567,2254
TASTE,2096,2040,2096,2096,2096,1940,2096
TECH,2082,1660,2082,2082,2082,2081,2082
WEIRD NEWS,2670,2284,2670,2670,2670,2209,2670
WORLD NEWS,2177,1826,2177,2177,2177,2175,2177


In [8]:
# rowcount
news.shape

(21450, 8)

## Preprocessing
#### We'll to turn "text" feature into word vectors

In [10]:
# Function for preprocessing
def preprocessing(text):
    #removing the punctuation
    text_v2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
    #tokenizes sentences into words
    tokens = [word for sent in nltk.sent_tokenize(text_v2) for word in nltk.word_tokenize(sent)]
    #lowercase
    tokens =[word.lower() for word in tokens]
    #Removing stopwords
    stopwds = stopwords.words('english')
    tokens = [token for token in tokens if token not in stopwds]
    #Removing words lenght <  3
    tokens = [token for token in tokens if len(tokens) >=3 ]
    #Stemming ("eating" into "eat")
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    #POS tagging
    tagged_corpus = pos_tag(tokens)
    
    Noun_tags = ['NN','NNP','NNPS','NNS']
    Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    lemmatizer = WordNetLemmatizer()
    
    def prat_lemmatize(token,tag):
        if tag in Noun_tags:
            return lemmatizer.lemmatize(token,'n')
        elif tag in Verb_tags:
            return lemmatizer.lemmatize(token,'v')
        else:
            return lemmatizer.lemmatize(token,'n')
    
    
    pre_proc_text = ' '.join([prat_lemmatize(token,tag)for token ,tag in tagged_corpus])
    
    return pre_proc_text
    
    

In [12]:
# Applies the preprocessing function
news_2 = []
for r in news['text']:
    news_2.append(preprocessing(r))
    

In [13]:
#split train and test dataset
x_train, x_test, y_train, y_test = train_test_split(news_2, news['category'])

In [14]:
#building TFIDF Vectorizer
vectorizer = TfidfVectorizer(min_df=3 ,stop_words ='english',strip_accents='unicode',ngram_range=(1,3))#,norm='l1')
x_train_2 = vectorizer.fit_transform(x_train).todense()

## Naive Bayes

In [15]:
# Training the model
nb = MultinomialNB().fit(x_train_2,y_train)
#Predict
nb_predicted = nb.predict(x_train_2)


In [16]:
# Evalueate test dataset 
x_test_2 = vectorizer.transform(x_test).todense() 
nb_predicted_test = nb.predict(x_test_2)

In [17]:
#confusion_matrix 
pd.crosstab(y_test, nb_predicted_test,rownames =["Actuall"],colnames = ["Predicted"])

Predicted,GREEN,MEDIA,RELIGION,SCIENCE,STYLE,TASTE,TECH,WEIRD NEWS,WORLD NEWS
Actuall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GREEN,530,14,8,14,2,4,0,65,16
MEDIA,17,592,23,3,7,2,22,23,14
RELIGION,20,38,536,7,6,3,4,21,16
SCIENCE,85,21,17,379,2,20,6,42,4
STYLE,13,20,15,3,446,20,12,36,3
TASTE,15,16,7,5,13,408,10,41,6
TECH,14,46,6,10,9,5,407,24,12
WEIRD NEWS,44,28,13,22,19,25,10,438,15
WORLD NEWS,41,34,51,3,7,1,7,18,382


In [18]:
# score on test dataset
print(classification_report(y_test, nb_predicted_test))

             precision    recall  f1-score   support

      GREEN       0.68      0.81      0.74       653
      MEDIA       0.73      0.84      0.78       703
   RELIGION       0.79      0.82      0.81       651
    SCIENCE       0.85      0.66      0.74       576
      STYLE       0.87      0.79      0.83       568
      TASTE       0.84      0.78      0.81       521
       TECH       0.85      0.76      0.81       533
 WEIRD NEWS       0.62      0.71      0.66       614
 WORLD NEWS       0.82      0.70      0.75       544

avg / total       0.78      0.77      0.77      5363



## K-Means

In [None]:
# K-means
km = KMeans(n_clusters = 9,init= 'random',n_init=1,verbose = 1)
km.fit(x_train_2)

In [41]:
km.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [43]:
km.labels_.shape

(7243,)

In [44]:
km.cluster_centers_

array([[  1.10199847e-04,   2.55529676e-03,   1.67927139e-04, ...,
          2.41521866e-04,   9.00642152e-04,   5.82826854e-05],
       [ -5.14996032e-19,   1.64798730e-17,   1.62630326e-19, ...,
         -5.96311195e-19,   1.57353824e-03,  -8.13151629e-19],
       [  2.98155597e-19,  -2.16840434e-18,   0.00000000e+00, ...,
         -4.33680869e-19,  -8.67361738e-19,   1.35525272e-19],
       [  2.39945655e-04,   4.27605676e-03,   7.42743450e-04, ...,
          9.67587611e-04,   3.03043404e-04,   5.94420561e-04]])

In [33]:
import matplotlib.pyplot as plt

In [45]:
print(pd.crosstab(y_train, km.labels_,rownames = ["Actuall"],colnames = ["Predicted"]) )

Predicted      0    1   2    3
Actuall                       
RELIGION    1650   38  99  115
SCIENCE      801   56   0  759
STYLE       1341  235   0  127
WEIRD NEWS  1830   69   0  123


In [38]:
x_test_2

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

##### Books: 
##### STATISTICS_FOR_MACHINE_LEARNING<br>BUILDING MACHINE LEARNING SYSTEMS WITH PYTHON