In [1]:
# The 20 newsgroups dataset comprises around 18,000 newsgroup posts on 20 topics,
# split into two subsets: one for training (or development) and one for testing (or performance evaluation).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
# Names corpus (Kantrowitz, Ross): about 8k male and female names
from nltk.corpus import names
# Returns the input word unchanged if it cannot be found in WordNet.
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK data packages that the imports above rely on: the 'names'
# corpus (nltk.corpus.names) and 'wordnet' (presumably backing WordNetLemmatizer).
nltk.download('names')
# Left as the last expression, so the cell displays this call's return value.
nltk.download('wordnet')

[nltk_data] Downloading package names to C:\Users\Ramesh
[nltk_data]     Narayanan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\names.zip.
[nltk_data] Downloading package wordnet to C:\Users\Ramesh
[nltk_data]     Narayanan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Single lemmatizer instance reused for every token processed by clean_text.
lemmatizer = WordNetLemmatizer()
# Materialise the names corpus as a set so clean_text can do fast
# membership checks when filtering out person names.
all_names = set(names.words())

def letters_only(astr):
    """Return True when every character of ``astr`` is alphabetic.

    An empty string has no offending character and therefore yields True.
    """
    return all(ch.isalpha() for ch in astr)



# Build a list of cleaned documents, keeping only purely alphabetic words that are not names
def clean_text(docs):
    """Return a cleaned copy of each document in ``docs``.

    Each document is split on whitespace. A token is kept only if it consists
    solely of letters (so a token with attached punctuation or digits is
    dropped entirely) and is not found in ``all_names``. The name check is
    made on the token as written, before lowercasing. Kept tokens are
    lowercased, lemmatized and re-joined with single spaces.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        One cleaned string per input document, in the same order.
    """
    def _clean_one(doc):
        kept = []
        for token in doc.split():
            # Same short-circuit order as a combined "letters and not a name" test.
            if not letters_only(token) or token in all_names:
                continue
            kept.append(lemmatizer.lemmatize(token.lower()))
        return ' '.join(kept)

    return [_clean_one(doc) for doc in docs]

In [3]:
#Multiclass classifier
# The five newsgroups used for the multiclass experiment.
categories = [
    'alt.atheism', 'talk.religion.misc', 'comp.graphics',
    'sci.space', 'rec.sport.hockey',
]
# Load the train split first, then the test split, with identical options.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, random_state=42)
    for split in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [7]:
# Clean the text data and retrieve labels
# Cleaned documents for each split (train is processed before test).
cleaned_train, cleaned_test = clean_text(data_train.data), clean_text(data_test.data)
# Target labels for each split, taken straight from the fetched datasets.
label_train, label_test = data_train.target, data_test.target

In [8]:
#Extract tf-idf features using Tfidf vectorizer
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
# Learn the vocabulary and idf weights from the training documents only,
# and return their term-document matrix.
term_docs_train = vectorizer.fit_transform(cleaned_train)
# Project the test documents onto the vocabulary learned above.
term_docs_test = vectorizer.transform(cleaned_test)
# fit() returns the estimator itself, so construction and training can be chained.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(term_docs_train, label_train)
accuracy = svm.score(term_docs_test, label_test)
print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100))

The accuracy on testing set is: 86.8%


In [9]:
#Check how it performs for individual classes
from sklearn.metrics import classification_report
# Predicted label for every test document.
prediction = svm.predict(term_docs_test)
# Per-class precision / recall / f1 of the predictions against the true labels.
report = classification_report(y_true=label_test, y_pred=prediction)
print(report)

             precision    recall  f1-score   support

          0       0.76      0.78      0.77       319
          1       0.89      0.94      0.91       389
          2       0.98      0.95      0.97       399
          3       0.91      0.90      0.91       394
          4       0.73      0.69      0.71       251

avg / total       0.87      0.87      0.87      1752



In [10]:
# Binary classification
# Restrict the experiment to two newsgroups.
categories = ['comp.graphics', 'sci.space']
# Load the train split first, then the test split, with identical options.
data_train, data_test = (
    fetch_20newsgroups(subset=split, categories=categories, random_state=42)
    for split in ('train', 'test')
)

In [11]:
# Cleaned documents for each split (train is processed before test).
cleaned_train, cleaned_test = clean_text(data_train.data), clean_text(data_test.data)
# Target labels for each split, taken straight from the fetched datasets.
label_train, label_test = data_train.target, data_test.target


In [12]:
#Check whether the classes are imbalanced
from collections import Counter
# Bare last expression: the notebook displays the number of training documents per label.
Counter(label_train)

Counter({0: 584, 1: 593})

In [13]:
#Extract tf-idf features using Tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english',
    max_features=8000,
)
# Fit on the training documents only, then reuse the fitted vectorizer for the test set.
term_docs_train = tfidf_vectorizer.fit_transform(cleaned_train)
term_docs_test = tfidf_vectorizer.transform(cleaned_test)

In [14]:
from sklearn.svm import SVC
# fit() returns the estimator itself, so construction and training can be chained.
svm = SVC(kernel='linear', C=1.0, random_state=42).fit(term_docs_train, label_train)
# Mean accuracy on the held-out test documents.
accuracy = svm.score(term_docs_test, label_test)
print('The accuracy on testing set is: {0:.1f}%'.format(accuracy*100))

The accuracy on testing set is: 96.4%


In [15]:
#Check how it performs for individual classes
from sklearn.metrics import classification_report
# Predicted label for every test document.
prediction = svm.predict(term_docs_test)
# Per-class precision / recall / f1 of the predictions against the true labels.
report = classification_report(y_true=label_test, y_pred=prediction)
print(report)

             precision    recall  f1-score   support

          0       0.97      0.96      0.96       389
          1       0.96      0.97      0.96       394

avg / total       0.96      0.96      0.96       783

