# Supervised + Unsupervised to Classify Labels of Newspapers

In [None]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import random

In [3]:
newsgroups = fetch_20newsgroups(subset = 'all', remove = ('headers','footers','quotes'))

In [4]:
data = pd.DataFrame()
data['text'] = newsgroups.data
data['labels'] = newsgroups.target
label_names = newsgroups.target_names
#IMPORT DATA INTO DATAFRAME


# Data Cleaning

In [5]:
#Make all text lowercase
data['cleaned'] = data['text'].str.lower()

In [6]:
import re
#Remove non alphabetic characters from data
data['cleaned'] = data['cleaned'].str.replace('\W+' , ' ')
data['cleaned'] = data['cleaned'].str.replace('\d+' , ' ')
                                              

In [7]:
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english')
#Remove stopwords, and download them if you need to

stop = set(stop)
data['cleaned'].apply(lambda x: ' '.join([item for item in x if item not in stop]))

0              u r e   e   b h e r   f   p e n   f n   ...
1          b r h e r     n   h e   r k e   f r     h g ...
2          f n l l   u     w h   u   r e   b u   e e r ...
3          h n k       h e   c   c r   n g   h e     r ...
4              h v e   n   l   j n e   r v e   w h c h ...
                               ...                        
18841    n   f r   n e   c n v x   u w e c   e u   v   ...
18842      n   n   l e   g r u n   r e c e p c l e   u ...
18843      j u   n l l e     x         c p u   n     c ...
18844      w u l n     h   r e q u r e     h p e r   p ...
18845    f e r     p   f r   g r   c r u   c r u   f c ...
Name: cleaned, Length: 18846, dtype: object

# Feature Creation

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(max_features = 5000)
x_counts = count_vect.fit_transform(data['cleaned'])
#Create count vectorizer from data
#x_counts.shape
#x_counts[0].toarray()


# Model Creation

about 80% train and 20% test.
u-fold cross validation


In [11]:
from sklearn.model_selection import train_test_split
#Split data into training and test datasets
x_train, x_test, y_train, y_test = train_test_split(x_counts, data['labels'], train_size = 0.8, test_size = 0.2)

In [12]:
x_train

<15076x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 1096143 stored elements in Compressed Sparse Row format>

In [13]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter = 2000)
clf.fit(x_train, y_train)
#Implement Linear SVM



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=2000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [14]:
predicted = clf.predict(x_test)
np.mean(predicted == y_test)

0.60026525198939

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train)
x_test_tfidf = tfidf_transformer.transform(x_test)

#Create TF-idf transformer from text

In [16]:
from sklearn.svm import LinearSVC
clf = LinearSVC(max_iter = 2000)
clf.fit(x_train, y_train)
#Implement Linear SVM for TF-IDF

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=2000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [17]:
predicted = clf.predict(x_test_tfidf)
np.mean(predicted == y_test)

0.6087533156498673

# Model Evaluation

In [5]:
from sklearn import metrics
print(metrics.classification)
#Find metrics of your model

# K-Means

In [None]:
from sklearn.cluster import KMeans
#Create K-means clustering
km = KMeans(n_clusters = 20)
km.fit(x_train_tfidf)