In [3]:
import spacy
import warnings
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
# Load the two predefined splits of the 20 Newsgroups corpus with identical
# options: shuffled, and fetched if not already available locally.
fetch_options = dict(shuffle=True, download_if_missing=True)
train = fetch_20newsgroups(subset='train', **fetch_options)
test = fetch_20newsgroups(subset='test', **fetch_options)

In [7]:
# Print the dataset's description text. Not every scikit-learn release exposes
# a `description` attribute on the returned Bunch; the documented attribute is
# `DESCR`, so fall back to it instead of failing with AttributeError.
print(train.description if hasattr(train, 'description') else train.DESCR)

the 20 newsgroups by date dataset


In [8]:
# Number of training documents.
# (The former bare `train.data[0]` was dropped: a notebook only echoes a cell's
# final expression, so it displayed nothing when followed by a print.)
print(len(train.data))

11314


In [10]:
# Display the names of the target classes (one entry per newsgroup category).
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [11]:
# Bag-of-words features: learn the vocabulary from the training documents and
# encode each document as a sparse row of raw token counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X_train_counts = cv.fit_transform(train.data)

term_doc_shape = X_train_counts.shape  # (number of documents, vocabulary size)
print(f'Shape of Term Document Matrix: {term_doc_shape}')

Shape of Term Document Matrix: (11314, 130107)


In [12]:
# Re-weight the raw counts with TF-IDF; the matrix keeps the same shape as the
# count matrix it is computed from.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

tfidf_shape = X_train_tfidf.shape
print(f'Shape of TfIdf Matrix: {tfidf_shape}')

Shape of TfIdf Matrix: (11314, 130107)


# Algorithmic Implementations

In [13]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, train.target)  # fit() hands back the fitted estimator


In [14]:
# Chain vectorizing, TF-IDF weighting and Naive Bayes into a single estimator,
# so raw text can be passed straight to fit() and predict().
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_nb_clf = Pipeline(nb_steps).fit(train.data, train.target)

In [15]:
# Score the Naive Bayes pipeline on the test split: accuracy is the percentage
# of test documents whose predicted label equals the true label.
predicted = text_nb_clf.predict(test.data)
nb_hits = predicted == test.target
accuracy = 100 * nb_hits.mean()
print(f'Test accuracy is {accuracy} %')

Test accuracy is 77.38980350504514 %


In [20]:
from sklearn.linear_model import LogisticRegression as LR


In [22]:
# Same preprocessing as the Naive Bayes pipeline, with logistic regression
# (default settings) as the final classifier.
lr_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LR()),
]
text_lr_clf = Pipeline(lr_steps).fit(train.data, train.target)

In [26]:
# Score the logistic-regression pipeline on the test split (percentage of
# correctly labelled documents).
lr_predicted = text_lr_clf.predict(test.data)
lr_hits = lr_predicted == test.target
lr_accuracy = 100 * lr_hits.mean()
print(f'Test accuracy is {lr_accuracy} %')

Test accuracy is 82.79341476367499 %


In [28]:
from sklearn import svm
from sklearn.linear_model import SGDClassifier


In [30]:
# Linear SVM trained with stochastic gradient descent (SGDClassifier with its
# default hinge loss), behind the same count -> TF-IDF preprocessing.
# SGD shuffles the training data between epochs, so the seed is fixed here to
# make the fitted model -- and the accuracy reported from it -- repeatable
# across runs.
text_svm_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svm-clf', SGDClassifier(random_state=42)),
])
text_svm_clf = text_svm_clf.fit(train.data, train.target)

In [31]:
# Score the SGD-trained linear SVM pipeline on the test split (percentage of
# correctly labelled documents).
svm_predicted = text_svm_clf.predict(test.data)
svm_hits = svm_predicted == test.target
svm_accuracy = 100 * svm_hits.mean()
print(f'Test accuracy is {svm_accuracy} %')

Test accuracy is 85.15666489644185 %
