# The mission:
#### Multi-class classification of news posts into categories (politics, wellness, travel, etc.).
#### We will use a news dataset, TF-IDF vectorization, and a Naive Bayes model.

In [4]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import nltk
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import feature_extraction, model_selection, pipeline, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import label_binarize

# from imblearn.over_sampling import SMOTE
# from imblearn.pipeline import Pipeline


In [5]:
# Global configuration: fix NumPy's legacy global seed and pick the seaborn theme.
SEED = 42
np.random.seed(SEED)
sns.set_theme(style="darkgrid")

**Loading the json file**

In [6]:
# Read the JSON-lines dataset: each non-empty line is one JSON-encoded record.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# default; errors='ignore' is kept so undecodable bytes are dropped (best-effort load).
DATA_PATH = '/kaggle/input/news-category-dataset/News_Category_Dataset_v2.json'
with open(DATA_PATH, mode='r', encoding='utf-8', errors='ignore') as json_file:
    # Skip blank lines, which would otherwise make json.loads raise.
    lst_dics = [json.loads(line) for line in json_file if line.strip()]
lst_dics[0]

**Create pandas df and observe it:**

In [7]:
## Build a DataFrame with one row per loaded record, then preview 10 random rows
df = pd.DataFrame(lst_dics)
df.sample(10)

In [8]:
# (rows, columns) of the raw DataFrame
df.shape

In [9]:
# Column names, dtypes and non-null counts
df.info()

**We are not interested in all the columns, only in the headline and the short description, which we use to build our NLP model.
We will concatenate them into a single column.
Our target is the "category" column:**

In [10]:
# Keep only the target plus one free-text feature built from headline + short description.
# .assign() returns a new frame, so no column is written onto a slice of the original
# (the slice-then-assign pattern triggers pandas' SettingWithCopyWarning).
df = (
    df[['category', 'headline', 'short_description']]
    .assign(text=lambda d: d['headline'] + " " + d['short_description'])
    [['category', 'text']]
)
df.sample(5)

**We will lowercase all the text in the df:**

In [12]:
# Lower-case and strip every string column; any other column passes through untouched.
# The previous guard `isinstance(x, object)` is always True, so a non-string column
# would have reached the `.str` accessor and raised.
df = df.apply(
    lambda col: col.str.lower().str.strip() if pd.api.types.is_string_dtype(col) else col
)
df.sample(5)

In [13]:
# Number of distinct values in the category column
df.category.nunique()

**We have 41 categories in the data frame. Let's visualize them:**

In [15]:
def bar_plot_cat(df):
    """Draw a bar chart of the number of rows per value of ``df['category']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``category`` column.
    """
    # rename_axis + reset_index(name=...) yields the columns ['category', 'count']
    # on every pandas version. Renaming {'index': 'category', 'category': 'count'}
    # breaks on pandas >= 2.0, where value_counts() already names the index
    # 'category' and the values 'count' (the rename then creates two 'count' columns).
    counts = (
        df['category']
        .value_counts()
        .rename_axis('category')
        .reset_index(name='count')
    )
    fig, ax = plt.subplots(1, 1, figsize=(14, 6), dpi=200)
    sns.barplot(data=counts, x='category', y='count', color='g', ax=ax);
    plt.xticks(rotation=60);

bar_plot_cat(df)

**We will do this project with only the 6 biggest categories.**        
**We can see that the data is unbalanced.**



In [16]:
# Restrict the data to the six categories used in this project and renumber rows from 0.
selected_categories = ['politics', 'wellness', 'entertainment', 'travel', 'style & beauty', 'parenting']
keep_mask = df['category'].isin(selected_categories)
df_reduced = df[keep_mask].reset_index(drop=True)

In [17]:
# (rows, columns) after filtering to the selected categories
df_reduced.shape

In [18]:
# Same category bar chart, now for the reduced frame
bar_plot_cat(df_reduced)

**a function to pre-process the text:**

In [20]:
stopwords = nltk.corpus.stopwords.words("english")

def utils_preprocess_text(text, flg_stemm=True, flg_lemm=True, lst_stopwords= stopwords):
    """Normalise one document into a space-separated string of processed tokens.

    Steps, in order: lower-case and strip, delete every character that is neither
    a word character nor whitespace, split on whitespace, drop stop words,
    Porter-stem, then WordNet-lemmatise.

    Parameters
    ----------
    text : any
        Input document; it is converted with ``str()`` first.
    flg_stemm : bool
        Apply ``PorterStemmer`` to every token when truthy.
    flg_lemm : bool
        Apply ``WordNetLemmatizer`` to every token when truthy.
    lst_stopwords : iterable of str or None
        Tokens to remove; ``None`` disables stop-word removal.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    ## clean (convert to lowercase, strip, and remove punctuation / special characters)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())

    ## Tokenize (convert from string to list)
    lst_text = text.split()

    ## remove Stopwords
    # NOTE(review): punctuation is removed before this filter, so a token such as
    # "don't" has already become "dont"; stop words that contain an apostrophe
    # will presumably never match. Confirm whether that is intended.
    if lst_stopwords is not None:
        # A set gives O(1) membership tests instead of scanning the list per token.
        stop_set = set(lst_stopwords)
        lst_text = [word for word in lst_text if word not in stop_set]

    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]

    ## Lemmatisation (convert the word into root word)
    if flg_lemm:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]

    ## back to string from list
    return " ".join(lst_text)

In [21]:
%%time
df_reduced["text_clean"] = df_reduced["text"].apply(lambda x: utils_preprocess_text(x))

In [22]:
# First rows, now including the text_clean column
df_reduced.head()


In [23]:
# Show the first three documents before and after preprocessing.
raw_texts = df_reduced['text'].iloc[:3]
clean_texts = df_reduced['text_clean'].iloc[:3]
for raw_text, clean_text in zip(raw_texts, clean_texts):
    print('before:', raw_text)
    print('after:', clean_text)
    print("------------------------------------------------------------------------------------------")

## Models

**TF-IDF WITH NAIVE BAYES CLASSIFIER**

TF-IDF, short for term frequency–inverse document frequency, is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.
![definition](https://miro.medium.com/max/1400/1*NzTKUS0puSpmopsQzt6HRw.png)
![image](https://miro.medium.com/max/1400/1*HZvxT29V9B4HxT2wx8M4XQ.png)
the images are from [here.](https://medium.com/codex/document-indexing-using-tf-idf-189afd04a9fc)

In [24]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)

# Features are the cleaned texts; the target is the category encoded as an integer.
X = df_reduced['text_clean']
le = LabelEncoder()
y = le.fit_transform(df_reduced['category'])

# le.classes_ holds the category names in encoded-id order (used to label plots later).
classes = le.classes_
classes

In [25]:
# Features and labels must have the same number of rows
X.shape, y.shape

In [26]:
# Hold out 30% of the documents for evaluation; random_state makes the split repeatable.
# NOTE(review): the split is not stratified; with imbalanced classes consider stratify=y.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [27]:
# Sizes of the train and test partitions
X_train.shape, y_train.shape, X_test.shape, y_test.shape

**We will use a Naive Bayes classifier to model our data.**
Naive Bayes is commonly applied to text classification.
It applies Bayes' theorem to the classification task, using this formula:

![naive bayes](https://vlsi.eelabs.technion.ac.il/wp-content/uploads/sites/18/2021/07/NBC.jpg)

In [29]:
## pipeline: TF-IDF vectorizer followed by a multinomial Naive Bayes classifier
# SMOTE could be added to treat the imbalanced data, but it made no difference here:
# smt = SMOTE(random_state=42)
# model = Pipeline([("vectorizer", vectorizer),('sm',smt),("classifier", classifier)])
classifier = MultinomialNB()
pipeline_steps = [
    ("vectorizer", vectorizer),
    ("classifier", classifier),
]
model = Pipeline(pipeline_steps)

## train: fitting the pipeline fits the vectorizer and the classifier on the training texts
model.fit(X_train, y_train)

In [30]:
# the fitted vocabulary (a mapping of term -> feature index) can be inspected here:
# dic_vocabulary = vectorizer.vocabulary_
# dic_vocabulary

In [31]:
# Hard class predictions and per-class probabilities for the held-out documents
y_pred = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [36]:
# Overall scores on the held-out set. ROC-AUC is computed one-vs-rest from the
# predicted class probabilities; the F1 score uses micro averaging.
accuracy = metrics.accuracy_score(y_test, y_pred)
auc = metrics.roc_auc_score(y_test, predicted_prob, 
                            multi_class="ovr")
f1 = metrics.f1_score(y_test, y_pred,average='micro')
print("Accuracy:",  round(accuracy,2))
print("Auc:", round(auc,2))
print("Micro f1:",round(f1,2))
print("Detail:")
# target_names labels every report row with the category name instead of the
# integer id produced by the LabelEncoder.
print(metrics.classification_report(y_test, y_pred, target_names=classes))
    

* **Precision measures the ability of a classification model to identify only the relevant data points, while recall measures its ability to find all the relevant cases within a dataset. The F1 score is the harmonic mean of the two.**
* **Because we deal with posts that can be ambiguous, I think these results are good even though it isn't a very complicated model.**
* **The macro average of precision, for example, is computed by calculating the precision of every class separately and then averaging those values.** 
* **The micro average of precision adds up all the TP and divides by (all TP + all FP), i.e. it calculates precision over all the classes together.** 
* **In a multi-class classification setup, the micro-average is often preferred if you suspect there might be class imbalance (like in our case), because every sample counts equally. Keep in mind that this also means large classes dominate the score, and that for single-label multi-class problems micro F1 equals accuracy.**
* **We have a micro F1 of 0.86, which is good for our task.**



In [33]:

## Plot confusion matrix
# Three views of the same matrix: raw counts, normalised per predicted class
# (columns) and normalised per true class (rows).
# Each spec is (normalize argument, annotation format, colour map, title).
confusion_specs = [
    (None, "d", "YlGnBu", "Confusion matrix by units:"),
    ("pred", ".1%", "RdBu", "Confusion matrix by normalize by predictions (columns):"),
    ("true", ".1%", "RdBu", "Confusion matrix normalize by true values (rows):"),
]
fig, ax = plt.subplots(3,1,figsize=(14,27))
for cm_ax, (normalize, fmt, cmap, title) in zip(ax, confusion_specs):
    cm = metrics.confusion_matrix(y_test, y_pred, normalize=normalize)
    sns.heatmap(cm, annot=True, fmt=fmt, ax=cm_ax, cmap=cmap,
                cbar=False);
    cm_ax.set(xlabel="Pred", ylabel="True", xticklabels=classes, title=title)
    # Label the rows straight from `classes` (kept horizontal) rather than copying
    # the x tick labels of whichever axes happened to be drawn before.
    cm_ax.set_yticklabels(classes, rotation=0)

# Plot roc
# roc_curve / precision_recall_curve expect one binary column per class
# ("label binarize" form, e.g. [0,0,1,0,0]), so y_test and y_pred are first
# transformed to this shape. le.transform(classes) gives the encoded id of every class.
y_test_bin = label_binarize(y_test, classes = le.transform(classes))
# y_pred_bin is not used below; it is kept so the notebook namespace is unchanged.
y_pred_bin = label_binarize(y_pred, classes = le.transform(classes))

fig, ax = plt.subplots(nrows=1, ncols=2,figsize=(14,6))
# One ROC curve per class: that class's binary column vs. its predicted probability.
for i in range(len(classes)):
    fpr, tpr, thresholds = metrics.roc_curve(y_test_bin[:,i],  
                           predicted_prob[:,i])
    ax[0].plot(fpr, tpr, lw=3, 
              label='{0} (area={1:0.2f})'.format(classes[i], 
                              metrics.auc(fpr, tpr))
               )
# Diagonal reference line from (0,0) to (1,1).
ax[0].plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
ax[0].set(xlim=[-0.05,1.0], ylim=[0.0,1.05], 
          xlabel='False Positive Rate', 
          ylabel="True Positive Rate (Recall)", 
          title="Receiver operating characteristic")
ax[0].legend(loc="lower right")
ax[0].grid(True)
    
## Plot precision-recall curve
# One curve per class; the legend shows the area under each precision-recall curve.
for i in range(len(classes)):
    precision, recall, thresholds = metrics.precision_recall_curve(
                 y_test_bin[:,i], predicted_prob[:,i])
    ax[1].plot(recall, precision, lw=3, 
               label='{0} (area={1:0.2f})'.format(classes[i], 
                                  metrics.auc(recall, precision))
              )
ax[1].set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall', 
          ylabel="Precision", title="Precision-Recall curve")
ax[1].legend(loc="best")
ax[1].grid(True)




# Conclusion
* **The overall accuracy and the micro F1 are both 0.86, which is a good result for this mission (because we deal with text that could belong to more than one class, and the implications of mistakes in this classification are not serious).**
* **The drawback of the method we used, TF-IDF vectors, is that each word gets a weight that ignores its context. If we want a more accurate and more complex model, we should consider models like BERT or Word2Vec. But as we said, for post classification, an accuracy and micro F1 of 0.86 are a good result.**



