# Classification and Clustering of Text Articles

## Setup

In [1]:
# importing Libraries

import pandas as pd
import numpy as np
import string
import re
import nltk
import matplotlib.pyplot as plt
from tabulate import tabulate
from scipy.stats import t
from collections import defaultdict

In [2]:
# Load both splits; only the first 10,000 training rows are kept.
train_data = pd.read_json('data/train.json').head(10000)
validation_data = pd.read_json('data/validation.json')

In [3]:
train_data.head()

Unnamed: 0,body,category,title
0,"Every day, cubicle-dwellers get up from their ...",4,MobileAccess Networks Strengthens Signals for ...
1,New 1.8-inch hard drives may boost battery lif...,4,Hitachi Drives Consumer Storage
2,A hearing into allegations of racism against t...,1,Cricket: Zim race probe halted
3,The prospect that a tropical storm and a hurri...,4,Simultaneous Tropical Storms are Very Rare
4,Second seed Jiri Novak and number three Guille...,2,NOVAK AND CANAS SET UP SHOWDOWN


In [4]:
validation_data.head()

Unnamed: 0,category,body,title
0,4,The first targeted flyby of Titan occurs on Tu...,Titan flyby overview
1,1,Officials can not estimate all casualties as s...,Dubai terminal construction collapse
2,4,A patch has been issued for the JpegOfDeath ho...,Will JpegOfDeath Help Slay Microsoft?
3,3,"Marsh amp; McLennan Companies Inc., the insur...",Update 2: Marsh Seeks Incentive Fees for Settl...
4,4,A Singaporean IT products distributor is intro...,PC distributor puts RFID tags in goods


In [7]:
def text_to_words(text):
    """Split *text* into lower-case tokens.

    Every ASCII punctuation character is first replaced by a space, so
    punctuation acts as a token separator; the result is then lower-cased
    and split on whitespace.
    """
    punctuation_to_space = {ord(symbol): ' ' for symbol in string.punctuation}
    cleaned = text.translate(punctuation_to_space)
    lowered = cleaned.lower()
    return lowered.split()

In [8]:
# Collect every distinct term that appears in the data set

def get_vocabulary(data):
    """Return the sorted list of distinct tokens found in *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``body`` and ``title`` columns containing strings.

    Returns
    -------
    list of str
        Every distinct token produced by ``text_to_words`` over all bodies
        and titles, in sorted order.
    """
    terms = set()
    # Iterate the two columns directly instead of doing two `.loc` row
    # lookups per document.
    for body, title in zip(data['body'], data['title']):
        terms.update(text_to_words(body))
        terms.update(text_to_words(title))
    # A set of strings has no stable iteration order across interpreter runs
    # (string hashing is randomized), so sort to make the vocabulary -- and
    # therefore the feature-column order built from it -- reproducible.
    return sorted(terms)

In [9]:
# Build the document-by-term raw term-frequency (tf) matrix; idf weighting is applied in a later cell

def build_tf(data, terms):
    """Build the raw term-frequency matrix of *data* over *terms*.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``body`` and ``title`` columns containing strings.
    terms : sequence of str
        Vocabulary; one output column per entry, in the given order.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(data), len(terms))``, indexed like *data*. Cell
        ``(d, t)`` is the number of times term ``t`` occurs in the body plus
        the title of document ``d``. Tokens not in *terms* are ignored.
    """
    n = data.shape[0]
    v = len(terms)
    weight_matrix = np.zeros((n, v))

    # One-off term -> column lookup table, replacing the per-word
    # `w in terms` / `terms.index(w)` list scans. `setdefault` keeps the
    # first column for a repeated term, which is what `list.index` returns.
    term_to_col = {}
    for col, term in enumerate(terms):
        term_to_col.setdefault(term, col)

    # Rows are addressed by position, not by index label, so the function
    # also works when `data.index` is not exactly 0..n-1 (e.g. a slice).
    for row in range(n):
        doc = data.iloc[row]
        counts = defaultdict(int)
        for word in text_to_words(doc['body']) + text_to_words(doc['title']):
            counts[word] += 1
        for word, count in counts.items():
            col = term_to_col.get(word)
            if col is not None:
                weight_matrix[row, col] = count

    return pd.DataFrame(weight_matrix, index=data.index, columns=terms)

In [10]:
terms = get_vocabulary(train_data)

In [None]:
# mat = build_tf(train_data[:100], terms)

In [None]:
# mat['target_category'] = train_data[:100]['category']

In [None]:
# mat

In [None]:
# classes = mat.groupby(['target_category']).sum()

In [None]:
# classes.sum(axis=1)

In [None]:
x_train = build_tf(train_data, terms)

In [None]:
x_val = build_tf(validation_data, terms)

In [None]:
x_train

In [None]:
x_val

In [None]:
x_train.astype(bool).sum(axis=0)

In [None]:
# Inverse document frequency (idf) per term, computed in two steps.
# Step 1: document frequency -- for each term (column), the number of rows
# of x_train in which the term has a non-zero count.
df = x_train.astype(bool).sum(axis=0)
# Step 2: idf = log10(N / document frequency), with N the number of training
# documents. NOTE: `df` is rebound here and holds idf values from now on.
df = np.log10(train_data.shape[0] / df)

In [None]:
df

In [None]:
# Turn raw term counts into tf-idf weights by scaling each term column with
# its idf value (`df` holds the idf Series at this point; pandas matches the
# Series index against the DataFrame columns).
x_train = df * x_train
y_train = train_data['category']

# The validation features are scaled with the same idf values, which were
# computed from the training data.
x_val = df * x_val
y_val = validation_data['category']

## Evaluation Functions

In [None]:
def confusion_matrix(y_true, y_pred, categories, labels=None):
    """Count (predicted, true) label pairs.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels in the same order. Series, ndarrays, lists
        and single-column DataFrames are accepted; comparison is positional.
    categories : sequence of str
        Display names, used to build the row/column headers.
    labels : sequence, optional
        The label value that corresponds to each entry of *categories*.
        Defaults to ``0 .. len(categories) - 1``. Pass e.g. ``[1, 2, 3, 4]``
        when the labels in the data are not zero-based.

    Returns
    -------
    pandas.DataFrame
        Square matrix whose rows are ``'Predicted <cat>'`` and whose columns
        are ``'True <cat>'``; cell ``[i, j]`` is the number of samples
        predicted as label ``i`` whose true label is ``j``.

    Raises
    ------
    ValueError
        If *labels* and *categories* differ in length, or if *y_true* and
        *y_pred* differ in length.
    """
    if labels is None:
        labels = range(len(categories))
    labels = list(labels)
    if len(labels) != len(categories):
        raise ValueError("labels and categories must have the same length")

    # Flatten to plain arrays so the comparison is positional and does not
    # depend on the inputs being pandas objects.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError("y_true and y_pred must have the same length")

    conf_mat = np.zeros((len(categories), len(categories)))
    for i, pred_label in enumerate(labels):
        pred_mask = pred_arr == pred_label
        for j, true_label in enumerate(labels):
            conf_mat[i, j] = np.count_nonzero(pred_mask & (true_arr == true_label))

    return pd.DataFrame(
        conf_mat,
        index=['Predicted ' + cat for cat in categories],
        columns=['True ' + cat for cat in categories],
    )

In [None]:
def accuracy(y_true, y_pred):
    """Return the fraction of positions where *y_pred* equals *y_true*.

    Both arguments may be a Series, ndarray, list or single-column DataFrame
    (the type returned by ``KNN.predict``). They are compared position by
    position, ignoring any pandas index.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    ZeroDivisionError
        If the inputs are empty.
    """
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.shape != pred_arr.shape:
        raise ValueError("y_true and y_pred must have the same length")
    return np.count_nonzero(true_arr == pred_arr) / len(true_arr)

In [None]:
def precision_recall(y_true, y_pred, categories):
    """Compute per-category recall and precision.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, forwarded to ``confusion_matrix``.
    categories : sequence of str
        Category display names, forwarded to ``confusion_matrix``.

    Returns
    -------
    (recall, precision) : tuple of numpy.ndarray
        Two arrays with one entry per category -- note the order, recall
        first. A category with no true samples gets recall 0.0; a category
        that was never predicted gets precision 0.0.
    """
    # `categories` has to be passed on; the confusion matrix is converted to
    # a plain array because `frame[i][i]` would be a column-label lookup.
    conf_mat = confusion_matrix(y_true, y_pred, categories).to_numpy()
    true_positives = np.diag(conf_mat)

    # Rows are predictions and columns are true labels, so:
    #   precision = TP / row total     (everything predicted as the class)
    #   recall    = TP / column total  (everything truly in the class)
    predicted_totals = conf_mat.sum(axis=1)
    actual_totals = conf_mat.sum(axis=0)

    precision = np.divide(
        true_positives, predicted_totals,
        out=np.zeros(len(categories)), where=predicted_totals != 0,
    )
    recall = np.divide(
        true_positives, actual_totals,
        out=np.zeros(len(categories)), where=actual_totals != 0,
    )

    return recall, precision

## Classification

### Naive-Bayes


#### Class Implementation


In [None]:
class Naive_Bayes:
    """Naive Bayes text classifier with additive (Laplace/Lidstone) smoothing.

    After ``fit`` the instance holds:

    * ``class_probs`` -- Series of log10 class priors, indexed by class label.
    * ``term_probs``  -- DataFrame of log10 P(term | class); one row per
      class label, one column per term.
    """

    def __init__(self, alpha):
        # Smoothing constant added to every per-class term count.
        self.alpha = alpha

    def predict(self, x_test):
        """Predict a category for each document in *x_test*.

        Parameters
        ----------
        x_test : pandas.DataFrame
            Raw documents with string columns ``body`` and ``title``.

        Returns
        -------
        pandas.DataFrame
            One ``category`` column, indexed like *x_test*, holding the class
            with the highest log score. Each distinct known term of a document
            contributes once; terms not seen during ``fit`` are ignored.
        """
        test_pred = pd.DataFrame(columns=['category'], index=x_test.index)
        known_terms = set(self.term_probs.columns)
        for doc_idx in x_test.index:
            doc = x_test.loc[doc_idx]
            words = set(text_to_words(doc.body) + text_to_words(doc.title))
            # Skip out-of-vocabulary words; selecting them would raise KeyError.
            doc_terms = [w for w in words if w in known_terms]
            # Log prior + sum of log likelihoods; the two operands are aligned
            # on the class label.
            scores = self.class_probs + self.term_probs[doc_terms].sum(axis=1)
            test_pred.at[doc_idx, 'category'] = scores.idxmax()
        return test_pred

    def fit(self, tf_mat):
        """Estimate log priors and log term likelihoods.

        Parameters
        ----------
        tf_mat : pandas.DataFrame
            Term counts (one row per document, one column per term) plus a
            ``target_category`` column with the class label. Not modified.
        """
        labels = tf_mat['target_category']
        self.class_probs = np.log10(labels.value_counts() / labels.shape[0])

        # Smooth the aggregated per-class counts. Adding alpha to every
        # document row before summing would scale the smoothing by the number
        # of documents in each class and would also mutate the caller's frame.
        class_term_counts = tf_mat.groupby('target_category').sum() + self.alpha
        class_totals = class_term_counts.sum(axis=1)

        # log P(t | c) = log count(t, c) - log total(c). `axis=0` subtracts
        # per row (class); the default would try to match the class labels
        # against the term columns.
        self.term_probs = np.log10(class_term_counts).sub(np.log10(class_totals), axis=0)

#### Evaluation


In [None]:
# Train the hand-written Naive Bayes model with smoothing constant 0.1.
nb_model = Naive_Bayes(alpha = 0.1)
# NOTE(review): the only visible cells that assign `mat` (and its
# 'target_category' column) are commented out, so this raises NameError
# unless `mat` was defined elsewhere -- confirm before running top to bottom.
nb_model.fit(mat)

In [None]:
mat

In [None]:
# Predict on the raw validation documents: Naive_Bayes.predict tokenizes the
# `body` and `title` columns itself, so it takes validation_data, not x_val.
y_pred = nb_model.predict(validation_data)

### K Nearest Neighbor


#### Class Implementation


In [None]:
class KNN:
    """k-nearest-neighbour classifier over dense feature matrices.

    Parameters
    ----------
    k : int
        Number of neighbours that vote.
    method : {'e', 'c'}
        ``'e'`` for Euclidean distance, ``'c'`` for cosine distance
        (1 - cosine similarity).
    """

    def __init__(self, k, method):
        self.k = k
        self.method = method

    def predict(self, x_test):
        """Predict a category for every row of *x_test*.

        Parameters
        ----------
        x_test : pandas.DataFrame
            Feature rows with the same columns as the training matrix.

        Returns
        -------
        pandas.DataFrame
            One ``category`` column, indexed like *x_test*, holding the most
            frequent label among the ``k`` nearest training rows (the
            smallest label wins a tie, as returned first by ``Series.mode``).
        """
        test_pred = pd.DataFrame(columns=['category'], index=x_test.index)
        dist_matrix = self.distance_matrix(x_test)

        for row, doc_idx in enumerate(x_test.index):
            # Stable sort keeps the neighbour choice deterministic on ties.
            neighbor_indexes = dist_matrix[row, :].argsort(kind='stable')[:self.k]
            majority = self.y.iloc[neighbor_indexes].mode()
            test_pred.at[doc_idx, 'category'] = majority.iat[0]

        return test_pred

    def distance_matrix(self, x_test):
        """Return distances between test rows and training rows.

        Returns
        -------
        numpy.ndarray
            Shape ``(n_test, n_train)``; smaller always means closer, for
            both methods.

        Raises
        ------
        ValueError
            If ``self.method`` is neither ``'e'`` nor ``'c'``.
        """
        train = np.asarray(self.x, dtype=float)
        test = np.asarray(x_test, dtype=float)

        if self.method == 'c':
            # Normalize each row separately, then turn the similarity into a
            # distance so that the ascending argsort in `predict` picks the
            # most similar rows.
            similarity = self._unit_rows(test) @ self._unit_rows(train).T
            return 1.0 - similarity

        if self.method == 'e':
            # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, using row-wise squared
            # norms instead of the diagonal of a full Gram matrix.
            test_sq = (test ** 2).sum(axis=1)[:, None]
            train_sq = (train ** 2).sum(axis=1)[None, :]
            squared = test_sq - 2.0 * (test @ train.T) + train_sq
            # Round-off can make tiny distances slightly negative; clip them
            # so the square root never yields NaN.
            return np.sqrt(np.maximum(squared, 0.0))

        raise ValueError(f"unknown distance method {self.method!r}; expected 'e' or 'c'")

    @staticmethod
    def _unit_rows(mat):
        """Scale every row of *mat* to unit length; all-zero rows stay zero."""
        norms = np.linalg.norm(mat, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return mat / norms

    def fit(self, x, y):
        """Store the training features *x* and their labels *y* (a Series)."""
        self.x = x
        self.y = y

#### Evaluation


In [None]:
knn_model = KNN(k=3, method='e')
# Fit on the training split. Fitting on the validation split and predicting
# the same rows makes every sample its own nearest neighbour (distance 0),
# which inflates the reported accuracy.
knn_model.fit(x_train, y_train)
# KNN.predict returns a one-column DataFrame; keep the label column so that
# y_pred can be compared element-wise with y_val.
y_pred = knn_model.predict(x_val)['category']

In [None]:
accuracy(y_val, y_pred)

In [None]:
y_val

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Reference baseline: scikit-learn's multinomial Naive Bayes on the same
# features (fit returns the estimator itself, so it can be chained).
clf = MultinomialNB().fit(x_train, y_train)
y_pred = clf.predict(x_val)

### Preprocessing Effect


In [None]:
from nltk.corpus import stopwords

def remove_stopwords(tokens):
    """Return the tokens that are not NLTK English stop words, in order."""
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatize_text(tokens):
    """Return the WordNet lemma of every token, preserving order."""
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatize(token))
    return lemmas

In [None]:
from nltk.stem import PorterStemmer

def stem_text(tokens):
    """Return the Porter stem of every token, preserving order."""
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems

In [None]:
tokens = remove_stopwords(text_to_words(train_data.loc[0].body))

In [None]:
lemmatize_text(tokens)

### SVM


In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Hyper-parameter tuning with GridSearchCV

# Candidate values for the SVM regularization strength C.
params = dict(C=[1, 10])
# 5-fold cross-validated grid search wrapped around a linear-kernel SVC.
svm_model = GridSearchCV(svm.SVC(kernel='linear'), params, cv=5)
svm_model.fit(x_train, y_train)

validation_score = svm_model.score(x_val, y_val)
print("Validation accuracy = ", validation_score)
print("Best params = ", svm_model.best_params_)

### Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid: number of trees and maximum tree depth.
n_estimators = [100, 200, 300]
max_depth = [5, 10, 15]

params = {'n_estimators': n_estimators,
          'max_depth': max_depth,}

# 5-fold cross-validated grid search over the random-forest parameters.
# (A stray heading had been pasted into the middle of the `rf_model`
# assignment, which made this cell a syntax error.)
rf_model = RandomForestClassifier()
rf_model = GridSearchCV(estimator = rf_model, param_grid = params, cv = 5)
rf_model.fit(x_train, y_train)

print("Validation accuracy = ", rf_model.score(x_val, y_val))
print("Best params = ", rf_model.best_params_)

## Clustering

### K-means

In [None]:
class K_Means:
    """K-means clustering (Lloyd's algorithm) with Euclidean distance.

    Parameters
    ----------
    k : int
        Number of clusters.
    threshold : float, default 0.01
        Iteration stops once no centroid moves farther than this between two
        consecutive iterations.
    max_iter : int, default 100
        Upper bound on the number of iterations.
    random_state : int or None, default None
        Seed for choosing the initial centroids; set it for reproducible
        results.

    After ``fit`` the instance holds ``centroids`` (array of shape
    ``(k, n_features)``) and ``labels`` (cluster index of every input row).
    """

    def __init__(self, k, threshold=0.01, max_iter=100, random_state=None):
        self.k = k
        self.threshold = threshold
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _assign(points, centroids):
        """Return the index of the nearest centroid for every point."""
        # Squared distances via ||p||^2 - 2 p.c + ||c||^2; the argmin is the
        # same as for the true distances, so no square root is needed.
        squared = (
            (points ** 2).sum(axis=1)[:, None]
            - 2.0 * (points @ centroids.T)
            + (centroids ** 2).sum(axis=1)[None, :]
        )
        return squared.argmin(axis=1)

    def fit(self, x, y=None):
        """Cluster the rows of *x*.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Data to cluster.
        y : optional
            Not used by the algorithm (clustering is unsupervised); it is
            only stored on the instance, e.g. for later comparison with the
            clusters.

        Raises
        ------
        ValueError
            If *x* is not two-dimensional or ``k`` is not in
            ``1 .. n_samples``.
        """
        self.x = x
        self.y = y

        points = np.asarray(x, dtype=float)
        if points.ndim != 2:
            raise ValueError("x must be a 2-D array of shape (n_samples, n_features)")
        n_samples = points.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError("k must be between 1 and the number of samples")

        # Start from k distinct data points picked at random.
        rng = np.random.default_rng(self.random_state)
        centroids = points[rng.choice(n_samples, size=self.k, replace=False)].copy()

        for _ in range(self.max_iter):
            labels = self._assign(points, centroids)
            updated = centroids.copy()
            for cluster in range(self.k):
                members = points[labels == cluster]
                # A cluster that lost all its points keeps its old centroid.
                if len(members):
                    updated[cluster] = members.mean(axis=0)
            shift = np.linalg.norm(updated - centroids, axis=1).max()
            centroids = updated
            if shift <= self.threshold:
                break

        self.centroids = centroids
        self.labels = self._assign(points, centroids)

### Evaluation

In [None]:
km_model = K_Means(k=3)
# K_Means.fit is declared as fit(x, y); calling it with x alone raises
# TypeError, so the training labels are passed along as well.
km_model.fit(x_train, y_train)

### t-SNE

In [6]:
from sklearn.manifold import TSNE

# Project the data onto two dimensions with t-SNE for plotting.
tsne = TSNE(n_components=2)
# NOTE(review): `data_subset` is not assigned anywhere in the visible cells;
# presumably a sample of feature rows -- confirm where it comes from.
tsne_results = tsne.fit_transform(data_subset)

(4, 2)

In [None]:
# Store the two t-SNE coordinates as columns and draw them as a scatter
# plot, coloured by the "y" column.
# NOTE(review): neither `df_subset` nor `sns` is defined or imported in the
# visible cells (`sns` is presumably seaborn) -- confirm before running.
df_subset['tsne-2d-one'] = tsne_results[:,0]
df_subset['tsne-2d-two'] = tsne_results[:,1]
plt.figure(figsize=(16,10))
sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="y",
    # 10 colours from the "hls" palette
    palette=sns.color_palette("hls", 10),
    data=df_subset,
    legend="full",
    alpha=0.3
)

## Word2Vec

In [None]:
from gensim.models import Word2Vec

# Create a Word2Vec model with a context window of 2. No corpus is passed
# here, so the vocabulary and training presumably happen in a later step.
# NOTE(review): the `size` keyword is the pre-4.0 gensim spelling (renamed
# `vector_size` in gensim 4) -- confirm against the installed version.
w2v_model = Word2Vec(window=2, size=300)