Binary Classification

In [25]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to two newsgroups, which makes this a two-class problem.
categories = ['comp.graphics', 'sci.space']

# Fetch the train split first, then the test split, with identical settings.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, random_state=42)
    for split in ('train', 'test')
)

In [26]:
import nltk

# Download the NLTK data packages used by the text-cleaning cell:
# the `names` corpus and WordNet (presumably what WordNetLemmatizer reads from).
nltk.download('names')
# Kept as the last expression, so its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [36]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer

# Set of entries from the NLTK `names` corpus; clean_text() discards tokens found in it.
all_names = set(names.words())
# One shared lemmatizer instance, reused by clean_text() for every kept token.
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
    """Turn raw documents into space-joined, lower-cased, lemmatized tokens.

    Each document is split on whitespace. A token survives only if it is
    purely alphabetic and, in its original casing, is not a member of the
    module-level ``all_names`` set. Surviving tokens are lower-cased and
    passed through the module-level ``lemmatizer``.

    Args:
        docs: Iterable of document strings.

    Returns:
        A list holding one cleaned string per input document, in input order.
    """
    def _keep(token):
        # The name lookup happens before lower-casing, on the token as written.
        return token.isalpha() and token not in all_names

    return [
        ' '.join(
            lemmatizer.lemmatize(token.lower())
            for token in filter(_keep, doc.split())
        )
        for doc in docs
    ]

In [37]:
# Run both splits through the same cleaning routine; targets are used as provided.
cleaned_train, label_train = clean_text(data_train.data), data_train.target
cleaned_test, label_test = clean_text(data_test.data), data_test.target

# Display how many labels each split contains.
len(label_train), len(label_test)

(1177, 783)

In [29]:
from collections import Counter
# Per-class label counts for the train split, then the test split (one per line).
print(Counter(label_train), Counter(label_test), sep='\n')

Counter({1: 593, 0: 584})
Counter({1: 394, 0: 389})


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features with English stop words removed and at most 8000 features.
tfidv = TfidfVectorizer(
    stop_words='english',
    max_features=8000,
)

# Fit on the training text only; the test text reuses the fitted vectorizer.
term_docs_train = tfidv.fit_transform(cleaned_train)
term_docs_test = tfidv.transform(cleaned_test)

In [39]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier with a fixed seed.
svm = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
)

# Train on the TF-IDF matrix; left as the last expression so the estimator is displayed.
svm.fit(term_docs_train, label_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [40]:
# Score the fitted classifier on the held-out test split.
accuracy = svm.score(term_docs_test, label_test)

# Report the score scaled by 100 (i.e. as a percentage).
print("The accuracy on testing set is ", accuracy * 100)

The accuracy on testing set is  95.91315453384419


Multiclass Classification

In [41]:
# NOTE(review): this is the same two-category list used in the binary section,
# so the task here is still two-class; add more newsgroups to this list if a
# multi-class experiment is intended.
categories = ['comp.graphics', 'sci.space']

# Fetch the train split first, then the test split, with identical settings.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, random_state=42)
    for split in ('train', 'test')
)

In [42]:
# Run both splits through the same cleaning routine; targets are used as provided.
cleaned_train, label_train = clean_text(data_train.data), data_train.target
cleaned_test, label_test = clean_text(data_test.data), data_test.target

# Display how many labels each split contains.
len(label_train), len(label_test)

(1177, 783)

In [43]:
# TF-IDF features: sublinear term-frequency scaling, terms appearing in more
# than half of the documents ignored, English stop words removed, and at most
# 8000 features.
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    max_features=8000,
)

# Fit on the training text only; the test text reuses the fitted vectorizer.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [44]:
# Linear-kernel support vector classifier with a fixed seed.
svm = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
)

# Train on the TF-IDF matrix; left as the last expression so the estimator is displayed.
svm.fit(term_docs_train, label_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [45]:
# Score the fitted classifier on the held-out test split.
accuracy = svm.score(term_docs_test, label_test)

# Report the score scaled by 100 (i.e. as a percentage).
print("The accuracy on testing set is ", accuracy * 100)

The accuracy on testing set is  96.42401021711366


In [46]:
from sklearn.metrics import classification_report

# Predict a label for every test document with the fitted classifier.
prediction = svm.predict(term_docs_test)
# Build the text report comparing true test labels against the predictions.
report = classification_report(label_test, prediction)
# print() is used so the report's line breaks and column alignment are rendered.
print(report)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       389
           1       0.96      0.97      0.96       394

    accuracy                           0.96       783
   macro avg       0.96      0.96      0.96       783
weighted avg       0.96      0.96      0.96       783

