In [1]:
import newsgroups
im
from NeuralNet import NeuralNet

# Load and preprocess data

In [2]:
data = newsgroups.load()

In [3]:
data

Unnamed: 0,text,group
0,morgan and guzman will have era's 1 run higher...,rec.sport.baseball
1,"Well, I just got my Centris 610 yesterday. It...",comp.sys.mac.hardware
2,History and classical methods. Modern methods....,sci.crypt
3,ATTENTION: Mac Quadra owners: Many storage ind...,comp.sys.mac.hardware
4,To show that the examples I and others have pr...,alt.atheism
...,...,...
18841,Why are circuit boards green? The material us...,sci.electronics
18842,bike. Luckily the guy stops a foot behind my ...,rec.motorcycles
18843,We were told that the resolution on the 5FGe c...,comp.sys.mac.hardware
18844,CAD Setup For Sale: G486PLB Local Bus Motherbo...,misc.forsale


# Split all data into train and test set

In [32]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data.group)

#### Train Set:

In [33]:
train.shape

(15076, 2)

#### Test Set:

In [34]:
test.shape

(3770, 2)

#### Train-Set label distribution:

In [35]:
train.group.value_counts()

rec.sport.hockey            799
soc.religion.christian      798
rec.motorcycles             797
rec.sport.baseball          795
sci.crypt                   793
rec.autos                   792
sci.med                     792
sci.space                   790
comp.windows.x              790
comp.os.ms-windows.misc     788
sci.electronics             787
comp.sys.ibm.pc.hardware    786
misc.forsale                780
comp.graphics               778
comp.sys.mac.hardware       770
talk.politics.mideast       752
talk.politics.guns          728
alt.atheism                 639
talk.politics.misc          620
talk.religion.misc          502
Name: group, dtype: int64

#### Test-Set label distribution

In [8]:
test.group.value_counts()

rec.sport.hockey            200
rec.sport.baseball          199
rec.motorcycles             199
soc.religion.christian      199
comp.windows.x              198
rec.autos                   198
sci.crypt                   198
sci.med                     198
sci.electronics             197
sci.space                   197
comp.os.ms-windows.misc     197
comp.sys.ibm.pc.hardware    196
comp.graphics               195
misc.forsale                195
comp.sys.mac.hardware       193
talk.politics.mideast       188
talk.politics.guns          182
alt.atheism                 160
talk.politics.misc          155
talk.religion.misc          126
Name: group, dtype: int64

# Encode text using bag of words

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

cv = CountVectorizer(max_features=20).fit(train.text)

In [10]:
keys = sorted((key, token) for token, key in cv.vocabulary_.items())
keys

[(0, 'and'),
 (1, 'are'),
 (2, 'as'),
 (3, 'ax'),
 (4, 'be'),
 (5, 'email'),
 (6, 'for'),
 (7, 'have'),
 (8, 'in'),
 (9, 'is'),
 (10, 'it'),
 (11, 'not'),
 (12, 'of'),
 (13, 'on'),
 (14, 'that'),
 (15, 'the'),
 (16, 'this'),
 (17, 'to'),
 (18, 'with'),
 (19, 'you')]

In [42]:
samples = ['this is an email', 'be not afraid', 'may force be with you and you']
encoded = pd.DataFrame(data = cv.transform(samples).toarray(), columns = [token for key, token in keys])

# Encode text using TF-IDF

Consider a document containing 100 words wherein the word ’cat’ appears 3 times. Assume we have 10 million documents and the word ’cat’ appears in one thousand of these. Calculate TF-idf for the term ‘cat’
1. Tf = 3 / 100 = 0.03
2. Idf = log(10000000 / 1000) = 4.0
3. TfIdf = Tf * Idf = 0.03 * 4.0 = 0.12

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=20).fit(train.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [13]:
keys = sorted((key, token) for token, key in tfidf.vocabulary_.items())
keys

[(0, 'and'),
 (1, 'are'),
 (2, 'as'),
 (3, 'ax'),
 (4, 'be'),
 (5, 'email'),
 (6, 'for'),
 (7, 'have'),
 (8, 'in'),
 (9, 'is'),
 (10, 'it'),
 (11, 'not'),
 (12, 'of'),
 (13, 'on'),
 (14, 'that'),
 (15, 'the'),
 (16, 'this'),
 (17, 'to'),
 (18, 'with'),
 (19, 'you')]

In [14]:
samples = ['this is an email', 'be not afraid', 'may force be with you and you']
encoded = pd.DataFrame(data = tfidf.transform(samples).toarray(), columns = [token for key, token in keys])
encoded

Unnamed: 0,and,are,as,ax,be,email,for,have,in,is,it,not,of,on,that,the,this,to,with,you
0,0.0,0.0,0.0,0.0,0.0,0.580743,0.0,0.0,0.0,0.525736,0.0,0.0,0.0,0.0,0.0,0.0,0.621563,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.696696,0.0,0.0,0.0,0.0,0.0,0.0,0.717367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.297537,0.0,0.0,0.0,0.391901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.396685,0.774936


#### Vocabulary

In [15]:
tfidf = TfidfVectorizer(max_features=10000).fit(train.text)
len(tfidf.vocabulary_)

10000

#### Encode train and test sets using same vocabulary

In [16]:
X_train = tfidf.transform(train.text).toarray()
print('X_train shape: %s' % (X_train.shape,))

X_test = tfidf.transform(test.text).toarray()
print('X_test shape: %s' % (X_test.shape,))

X_train shape: (15076, 10000)
X_test shape: (3770, 10000)


# Convert labels to integer values

#### Fit label encoder:

In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder().fit(train.group)

#### Known labels:

In [18]:
label_encoder.classes_

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

#### Convert labels using encoder:

In [19]:
Y_train = label_encoder.transform(train.group)
print('Y_train shape: %s' % (Y_train.shape,))

Y_test = label_encoder.transform(test.group)
print('Y_test shape: %s' % (Y_test.shape,))

Y_train shape: (15076,)
Y_test shape: (3770,)


# Naive Bayes

In [60]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train, Y_train)
# from NeuralNetwork import NeuralNet

# cls = NeuralNet((10, ), normalize=False, learning_rate=0.1, num_iter = 1)
# cls.fit(X_train, Y_train, epsilon=1e-15)


#### Train report:

In [61]:
from sklearn.metrics import classification_report

print(classification_report(Y_train, clf.predict(X_train), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       639
           comp.graphics       0.84      0.85      0.85       778
 comp.os.ms-windows.misc       0.83      0.86      0.85       788
comp.sys.ibm.pc.hardware       0.79      0.88      0.83       786
   comp.sys.mac.hardware       0.92      0.89      0.91       770
          comp.windows.x       0.92      0.88      0.90       790
            misc.forsale       0.90      0.80      0.85       780
               rec.autos       0.91      0.94      0.92       792
         rec.motorcycles       0.96      0.95      0.96       797
      rec.sport.baseball       0.96      0.97      0.97       795
        rec.sport.hockey       0.94      0.98      0.96       799
               sci.crypt       0.91      0.95      0.93       793
         sci.electronics       0.88      0.84      0.86       787
                 sci.med       0.94      0.95      0.95       792
         

#### Test report:

In [22]:
print(classification_report(Y_test, clf.predict(X_test), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       0.79      0.58      0.67       160
           comp.graphics       0.72      0.79      0.76       195
 comp.os.ms-windows.misc       0.77      0.77      0.77       197
comp.sys.ibm.pc.hardware       0.69      0.79      0.73       196
   comp.sys.mac.hardware       0.85      0.77      0.81       193
          comp.windows.x       0.84      0.82      0.83       198
            misc.forsale       0.89      0.75      0.81       195
               rec.autos       0.87      0.86      0.86       198
         rec.motorcycles       0.88      0.90      0.89       199
      rec.sport.baseball       0.95      0.95      0.95       199
        rec.sport.hockey       0.93      0.93      0.93       200
               sci.crypt       0.85      0.91      0.88       198
         sci.electronics       0.82      0.77      0.79       197
                 sci.med       0.94      0.88      0.91       198
         

In [23]:
from sklearn.metrics import confusion_matrix


cm = pd.DataFrame(data = confusion_matrix(Y_test, clf.predict(X_test)), 
                  index = label_encoder.classes_, 
                  columns= label_encoder.classes_)
cm

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,93,1,0,0,1,0,0,0,2,0,0,0,0,0,0,50,7,6,0,0
comp.graphics,0,155,7,7,2,10,1,1,2,0,2,4,1,1,1,1,0,0,0,0
comp.os.ms-windows.misc,1,9,151,18,2,9,1,1,1,0,0,2,1,0,1,0,0,0,0,0
comp.sys.ibm.pc.hardware,0,5,13,154,9,3,3,1,1,0,1,1,5,0,0,0,0,0,0,0
comp.sys.mac.hardware,0,3,6,18,148,1,2,1,0,1,1,3,8,0,0,1,0,0,0,0
comp.windows.x,0,16,3,7,0,163,0,1,0,0,1,2,1,0,3,1,0,0,0,0
misc.forsale,0,1,1,12,5,1,147,4,6,0,4,5,6,0,2,1,0,0,0,0
rec.autos,0,1,2,1,0,1,3,170,6,1,1,0,2,3,0,4,3,0,0,0
rec.motorcycles,0,3,0,0,0,0,1,7,180,0,0,0,1,0,2,3,2,0,0,0
rec.sport.baseball,0,0,2,0,0,0,1,0,1,190,3,0,0,0,0,0,1,0,1,0


In [62]:
plt.matshow(cm)
plt.colorbar()

NameError: name 'plt' is not defined

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### Train report:

In [25]:
print(classification_report(Y_train, clf.predict(X_train), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.91      0.92       639
           comp.graphics       0.88      0.92      0.90       778
 comp.os.ms-windows.misc       0.88      0.88      0.88       788
comp.sys.ibm.pc.hardware       0.87      0.87      0.87       786
   comp.sys.mac.hardware       0.94      0.92      0.93       770
          comp.windows.x       0.93      0.92      0.92       790
            misc.forsale       0.83      0.91      0.87       780
               rec.autos       0.94      0.94      0.94       792
         rec.motorcycles       0.97      0.96      0.97       797
      rec.sport.baseball       0.96      0.98      0.97       795
        rec.sport.hockey       0.99      0.98      0.99       799
               sci.crypt       0.98      0.95      0.96       793
         sci.electronics       0.90      0.92      0.91       787
                 sci.med       0.96      0.97      0.96       792
         

#### Test report

In [26]:
print(classification_report(Y_test, clf.predict(X_test), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       0.78      0.78      0.78       160
           comp.graphics       0.75      0.81      0.78       195
 comp.os.ms-windows.misc       0.82      0.78      0.80       197
comp.sys.ibm.pc.hardware       0.77      0.74      0.76       196
   comp.sys.mac.hardware       0.85      0.81      0.83       193
          comp.windows.x       0.83      0.84      0.83       198
            misc.forsale       0.75      0.85      0.80       195
               rec.autos       0.84      0.85      0.85       198
         rec.motorcycles       0.91      0.90      0.90       199
      rec.sport.baseball       0.93      0.94      0.94       199
        rec.sport.hockey       0.98      0.91      0.94       200
               sci.crypt       0.95      0.89      0.92       198
         sci.electronics       0.80      0.84      0.82       197
                 sci.med       0.89      0.88      0.89       198
         

# Support Vector Machines

https://en.wikipedia.org/wiki/Support-vector_machine#Linear_SVM

#### Hard margin

![image.png](images/svm.png)

$$ \huge \overrightarrow{w} \cdot \overrightarrow{x} - b = 1;$$
$$ \huge \overrightarrow{w} \cdot \overrightarrow{x} - b = -1;$$
$$ \huge distance = \frac{2}{||\overrightarrow{w}||};$$
$$ \huge y_i(\overrightarrow{w} \cdot \overrightarrow{x_i} - b) \geq 1, \forall i;$$

#### Soft margin

![image.png](images/svm_2.png)

$$ \huge hinge = max(0, 1 - y_i(\overrightarrow{w} \cdot \overrightarrow{x_i} - b));$$
$$ \huge loss = \frac{1}{n} \sum_{i=1}^{n} hinge + \lambda ||\overrightarrow{w}||^2;$$
$$ \huge loss = \frac{1}{n} \sum_{i=1}^{n} max(0, 1 - y_i(\overrightarrow{w} \cdot \overrightarrow{x_i} - b)) + \lambda ||\overrightarrow{w}||^2;$$

In [27]:
from sklearn.svm import LinearSVC

clf = LinearSVC().fit(X_train, Y_train)

#### Train report:

In [28]:
print(classification_report(Y_train, clf.predict(X_train), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       1.00      0.99      0.99       639
           comp.graphics       0.99      0.99      0.99       778
 comp.os.ms-windows.misc       0.98      0.98      0.98       788
comp.sys.ibm.pc.hardware       0.98      0.97      0.98       786
   comp.sys.mac.hardware       1.00      0.99      0.99       770
          comp.windows.x       0.99      0.99      0.99       790
            misc.forsale       0.95      0.99      0.97       780
               rec.autos       1.00      0.99      1.00       792
         rec.motorcycles       1.00      1.00      1.00       797
      rec.sport.baseball       1.00      1.00      1.00       795
        rec.sport.hockey       1.00      1.00      1.00       799
               sci.crypt       1.00      1.00      1.00       793
         sci.electronics       1.00      0.99      0.99       787
                 sci.med       1.00      0.99      1.00       792
         

#### Test report:

In [29]:
print(classification_report(Y_test, clf.predict(X_test), target_names=label_encoder.classes_))

                          precision    recall  f1-score   support

             alt.atheism       0.82      0.81      0.82       160
           comp.graphics       0.81      0.86      0.83       195
 comp.os.ms-windows.misc       0.81      0.85      0.83       197
comp.sys.ibm.pc.hardware       0.81      0.77      0.79       196
   comp.sys.mac.hardware       0.85      0.82      0.84       193
          comp.windows.x       0.86      0.86      0.86       198
            misc.forsale       0.80      0.88      0.84       195
               rec.autos       0.88      0.88      0.88       198
         rec.motorcycles       0.93      0.91      0.92       199
      rec.sport.baseball       0.95      0.95      0.95       199
        rec.sport.hockey       0.97      0.93      0.95       200
               sci.crypt       0.97      0.92      0.94       198
         sci.electronics       0.86      0.83      0.85       197
                 sci.med       0.91      0.90      0.91       198
         

In [30]:
import pickle

with open('svc.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [31]:
with open('svc.pkl', 'rb') as f:
    new_clf = pickle.load(f)
    x = tfidf.transform(['I hate photoshop'])
    y_encoded = new_clf.predict(x)
    y_labeled = label_encoder.inverse_transform(y_encoded) 
    print(y_labeled)

['comp.graphics']


Further reading https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html