`pip install pandas scikit-learn seaborn matplotlib nltk`


In [2]:
import pandas as pd

In [3]:
df = pd.read_json('../input/news-category-dataset/News_Category_Dataset_v3.json', lines=True)

In [5]:
df['category'] = df['category'].astype('category')


In [6]:
df['headline'] = df['headline'].dropna()

In [7]:
df['short_description'] = df['short_description'].dropna()

In [8]:
df['full_text'] = (df['headline'] + ' ' + df['short_description'])

In [9]:
df['full_text'] = [e.lower() for e in df['headline']]

In [10]:
import nltk

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
nltk.download('punkt_tab')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cnt

In [22]:
from nltk.tokenize import word_tokenize

df['tokenized'] = [word_tokenize(e) for e in df['full_text']]

In [12]:
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords

tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

stopword_list = stopwords.words('english')

In [23]:
from nltk import pos_tag, WordNetLemmatizer, word_tokenize

lemmatizer = WordNetLemmatizer()
def process_text(tokens):
    processed = [
        lemmatizer.lemmatize(word, tag_map.get(tag[0], 'n'))
        for word, tag in pos_tag(tokens)
        if word.isalpha() and word not in stopword_list
    ]
    return ' '.join(processed)

df['processed_text'] = df['tokenized'].apply(process_text)


In [24]:
from sklearn.model_selection import train_test_split

x = df['processed_text']
y = df['category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [25]:
from sklearn.preprocessing import LabelEncoder

y_train_encoded = LabelEncoder().fit_transform(y_train)
y_test_encoded = LabelEncoder().fit_transform(y_test)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
train_x_tfidf = tfidf_vect.fit_transform(x_train)


In [27]:
from sklearn import naive_bayes

naive = naive_bayes.MultinomialNB()
naive.fit(train_x_tfidf, y_train_encoded)

In [28]:
test_x_tfidf = tfidf_vect.transform(x_test)
predictions = naive.predict(test_x_tfidf)

In [None]:
from sklearn import svm

svm_model = svm.SVC(kernel='linear')
svm_model.fit(train_x_tfidf, y_train_encoded)

In [None]:
import joblib
joblib.dump(svm_model, '/kaggle/working/svm_model.pkl')

In [None]:
predictions = svm_model.predict(test_x_tfidf)

In [29]:
from sklearn.metrics import confusion_matrix, f1_score

category_names = pd.unique(y_train).sort_values()
mat = confusion_matrix(y_test_encoded, predictions)
f1_score_result = f1_score(y_test_encoded, predictions, average='micro')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

print(f1_score_result)

print(mat)

plt.rcParams.update({'font.size': 8, 'figure.figsize': [15, 15]})
sns.set(font_scale=0.8)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=category_names, yticklabels=category_names)
plt.xlabel('true label')
plt.ylabel('predicted label')

0.41094354030449104
[[ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  9 ...  0  0  0]
 ...
 [ 0  0  0 ... 34  0  0]
 [ 0  0  0 ...  0  8  0]
 [ 0  0  0 ...  0  2  0]]


Text(159.5, 0.5, 'predicted label')