In [1]:
# In this task, we are going to use BBCNews dataset. There are 1490 articles from 5 topics, including tech, business, sport, entertainment, politics.

#     Task 1: Please use KNN and logistic regression to do classification, and compare their performance.

#     Task 2: Please use K-means to partition this dataset into 5 clusters and find the representative words in each cluster.


In [2]:
# 2.1 Load data and represent it with TF-IDF representation

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

df = pd.read_csv('BBC_News_Train.csv')

df_train, df_test = train_test_split(df, test_size=0.15)

print("Train data target names: {}".format(df_train['Category'].unique()))

print('Training samples: {}'.format(len(df_train)))
print('Testing samples: {}'.format(len(df_test)))


tfidf = TfidfVectorizer(stop_words='english')
df_train_vectors = tfidf.fit_transform(df_train['Text'])
df_test_vectors = tfidf.transform(df_test['Text']) 

print(df_train_vectors.shape, df_test_vectors.shape)

Train data target names: ['politics' 'business' 'tech' 'entertainment' 'sport']
Training samples: 1266
Testing samples: 224
(1266, 22758) (224, 22758)


In [3]:
# 2.2 Use KNN to do document classification

Xtrain = df_train_vectors
Ytrain = df_train['Category']

Xtest = df_test_vectors
Ytest = df_test['Category']

k_range = range(1, 5)
param_grid = dict(n_neighbors=k_range)

clf_knn =  KNeighborsClassifier()

grid = GridSearchCV(clf_knn, param_grid, scoring='accuracy')
grid.fit(Xtrain, Ytrain)

print(grid.best_score_)
print(grid.best_params_)



0.9328654570352619
{'n_neighbors': 4}


In [6]:
# 2.3 Use Logistic Regression to do document classification

coeff = range(1, 10)
param_grid = dict(C=coeff)

clf_lr = LogisticRegression(penalty='l2')

grid = GridSearchCV(clf_lr, param_grid, scoring='accuracy')
grid.fit(Xtrain, Ytrain)

print(grid.best_params_)


clf_lr = LogisticRegression(penalty='l2', C=grid.best_params_['C'])
clf_lr.fit(Xtrain, Ytrain)

y_pred = clf_lr.predict(Xtest)

acc = accuracy_score(Ytest, y_pred)
macro_f1 = f1_score(Ytest, y_pred, average='macro')
micro_f1 = f1_score(Ytest, y_pred, average='micro')

print('Accuracy: ' + str(acc) + ' | Macro F1: ' +  str(macro_f1) + ' | Micro F1: ' + str(micro_f1))

{'C': 6}
Accuracy: 0.9821428571428571 | Macro F1: 0.9813431108060262 | Micro F1: 0.9821428571428571


In [8]:
# 2.4 Use K-means to do document clustering and find the 10 most representative words in each cluster.

clf_kmeans = KMeans(n_clusters=5)
y = clf_kmeans.fit(Xtrain)
terms = tfidf.get_feature_names()
order_centroids = clf_kmeans.cluster_centers_.argsort()[:, ::-1]
for i in range(5):
     print("Cluster %d:" % i)
     for ind in order_centroids[i, :10]:
         print(' %s' % terms[ind])

Cluster 0:
 said
 mobile
 people
 music
 software
 users
 digital
 technology
 microsoft
 net
Cluster 1:
 mr
 labour
 said
 blair
 election
 party
 government
 brown
 minister
 prime
Cluster 2:
 said
 growth
 economy
 year
 mr
 company
 market
 bank
 sales
 economic
Cluster 3:
 england
 game
 said
 win
 match
 chelsea
 team
 cup
 players
 season
Cluster 4:
 film
 best
 awards
 band
 award
 actor
 said
 album
 star
 festival


