In [1]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
newsgroups =  fetch_20newsgroups(subset='all', remove = ('headers', 'footers', 'quotes')) #LOAD DATASET

In [4]:
data = pd.DataFrame()
data['text'] = newsgroups.data
data['labels'] = newsgroups.target
label_names = newsgroups.target_names
data

#IMPORT DATA INTO DATAFRAME

Unnamed: 0,text,labels
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4
...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13
18842,\nNot in isolated ground recepticles (usually ...,12
18843,I just installed a DX2-66 CPU in a clone mothe...,3
18844,\nWouldn't this require a hyper-sphere. In 3-...,1


# Data Cleaning

In [5]:
#Make all text lowercase
data['cleaned'] = data['text'].str.lower()
data.shape

(18846, 3)

In [6]:
import re
#Remove non alphabetic characters from data
data['cleaned'] = data['cleaned'].str.replace('\W+', ' ')
data['cleaned'] = data['cleaned'].str.replace('\d+', ' ')
data

Unnamed: 0,text,labels,cleaned
0,\n\nI am sure some bashers of Pens fans are pr...,10,i am sure some bashers of pens fans are prett...
1,My brother is in the market for a high-perform...,3,my brother is in the market for a high perform...
2,\n\n\n\n\tFinally you said what you dream abou...,17,finally you said what you dream about mediter...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,think it s the scsi card doing the dma transf...
4,1) I have an old Jasmine drive which I cann...,4,i have an old jasmine drive which i cannot u...
...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13,dn from nyeda cnsvax uwec edu david nye dn a n...
18842,\nNot in isolated ground recepticles (usually ...,12,not in isolated ground recepticles usually an...
18843,I just installed a DX2-66 CPU in a clone mothe...,3,i just installed a dx cpu in a clone mother...
18844,\nWouldn't this require a hyper-sphere. In 3-...,1,wouldn t this require a hyper sphere in spa...


In [7]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

stop = set(stop)
data['cleaned'] = data['cleaned'].apply(lambda x:' '.join([item for item in x.split() if item not in stop]))
len(stop)
#Remove stopwords, and download them if you need to

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yuktivijay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [8]:
data

Unnamed: 0,text,labels,cleaned
0,\n\nI am sure some bashers of Pens fans are pr...,10,sure bashers pens fans pretty confused lack ki...
1,My brother is in the market for a high-perform...,3,brother market high performance video card sup...
2,\n\n\n\n\tFinally you said what you dream abou...,17,finally said dream mediterranean new area grea...
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,think scsi card dma transfers disks scsi card ...
4,1) I have an old Jasmine drive which I cann...,4,old jasmine drive cannot use new system unders...
...,...,...,...
18841,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,13,dn nyeda cnsvax uwec edu david nye dn neurolog...
18842,\nNot in isolated ground recepticles (usually ...,12,isolated ground recepticles usually unusual co...
18843,I just installed a DX2-66 CPU in a clone mothe...,3,installed dx cpu clone motherboard tried mount...
18844,\nWouldn't this require a hyper-sphere. In 3-...,1,require hyper sphere space points specifies sp...


# Feature Creation

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
#Create count vectorizer from data
count_vect = CountVectorizer(max_features = 5000)
x_counts = count_vect.fit_transform(data['cleaned'])


In [10]:
x_counts

<18846x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 896492 stored elements in Compressed Sparse Row format>

# Model Creation

In [11]:
from sklearn.model_selection import train_test_split
#Split data into training and test datasets
x_train, x_test, y_train, y_test = train_test_split(x_counts, data['labels'], train_size = 0.8, test_size = 0.2)


In [12]:
x_train

<15076x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 717063 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter = 5000)
clf.fit(x_train, y_train)

#Implement Linear SVM



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [14]:
predicted = clf.predict(x_test)
np.mean(predicted == y_test)

0.6161803713527851

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_test_tfidf = tfidf_transformer.transform(x_test)


In [16]:
x_train_tfidf

<15076x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 717063 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.svm import LinearSVC
#Implement Linear SVM for TF-IDF
clf = LinearSVC(max_iter = 5000)
clf.fit(x_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=5000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [18]:
predicted = clf.predict(x_test_tfidf)
np.mean(predicted == y_test)

0.6193633952254642

In [19]:
x_test_tfidf

<3770x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 179429 stored elements in Compressed Sparse Row format>

# Model Evaluation

In [20]:
from sklearn import metrics
#Find metrics of your model
print(metrics.classification_report(y_test, predicted, target_names = label_names))

                          precision    recall  f1-score   support

             alt.atheism       0.59      0.45      0.51       165
           comp.graphics       0.61      0.51      0.56       202
 comp.os.ms-windows.misc       0.58      0.58      0.58       205
comp.sys.ibm.pc.hardware       0.61      0.54      0.57       189
   comp.sys.mac.hardware       0.60      0.61      0.61       201
          comp.windows.x       0.75      0.75      0.75       203
            misc.forsale       0.73      0.63      0.68       195
               rec.autos       0.36      0.71      0.48       194
         rec.motorcycles       0.61      0.68      0.64       195
      rec.sport.baseball       0.71      0.74      0.72       186
        rec.sport.hockey       0.83      0.78      0.81       197
               sci.crypt       0.76      0.69      0.72       215
         sci.electronics       0.58      0.48      0.52       199
                 sci.med       0.68      0.74      0.71       187
         

# K-Means

In [None]:
from sklearn.cluster import KMeans
#Create K-means clustering
km = KMeans(n_clusters = 20)
km.fit(x_train_tfidf)