In [None]:
import pickle
import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model  import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import warnings
warnings.filterwarnings("ignore")


# The Task: News Group Classification

Given documents in different news groups (i.e., topics):
```
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']
```
 
Train a classifier to predict the topic of a given document. 


## Step 1: Preprocessing the data
To start off, we're going to load the data from sklearn and do some simple preprocessing. We'll remove the headers, footers and quotes in the articles. We'll also do the same preprocessing as before.


To sanity check the data, we'll look at a few examples.


In [None]:

# The twenty newsgroup topics the classifier has to tell apart.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Both splits are fetched with identical settings: headers, footers and quoted
# replies are removed, and only the categories listed above are requested.
fetch_options = dict(remove=('headers', 'footers', 'quotes'), categories=categories)
full_train = fetch_20newsgroups(subset='train', **fetch_options)
test = fetch_20newsgroups(subset='test', **fetch_options)

def preprocess_data(data):
    """Normalise raw documents into ``(text, label, label_name)`` tuples.

    Every document is cleaned by collapsing each run of non-word characters
    into a single space, lower-casing, and stripping leading/trailing
    whitespace.

    Args:
        data: Mapping with the keys ``'data'`` (sequence of document strings),
            ``'target'`` (sequence of label indices, parallel to ``'data'``)
            and ``'target_names'`` (sequence of label names indexed by label).

    Returns:
        A list with one ``(cleaned_text, label, label_name)`` tuple per
        document, in the same order as ``data['data']``.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence,
    # which newer Python versions warn about and will eventually reject.
    non_word_run = re.compile(r'\W+')
    targets = data['target']
    target_names = data['target_names']

    processed_data = []
    for indx, sample in enumerate(data['data']):
        label = targets[indx]
        text = non_word_run.sub(' ', sample).lower().strip()
        processed_data.append((text, label, target_names[label]))
    return processed_data


# Clean the full training split, then hold out its last 5000 documents as a
# development set; everything before them is used for training.
full_train_set = preprocess_data(full_train)
dev_size = 5000
train_set, dev_set = full_train_set[:-dev_size], full_train_set[-dev_size:]

# The official test split goes through the same cleaning.
test_set = preprocess_data(test)



In [None]:
# Report how many documents ended up in each split.
num_train = len(train_set)
print("Num Train: {}".format(num_train))
num_dev = len(dev_set)
print("Num Dev: {}".format(num_dev))
num_test = len(test_set)
print("Num Test: {}".format(num_test))

# Show the first two training tuples (text, label, label_name) as a sanity check.
print("Example Documents:")
first_example = train_set[0]
print(first_example)
print()
second_example = train_set[1]
print(second_example)


## Step 2: Feature Engineering 

How do we represent a document? This is up to you!
Remember, you can vary the vocabulary size or choose to include ``ngrams``!

Remember, we can do this very easily with ```sklearn.feature_extraction.text.CountVectorizer```

<img src="vectorizer.png">


In [None]:
# Split each (text, label, label_name) tuple into two parallel lists per split:
# the document texts and their integer labels.
trainText = [t[0] for t in train_set]
trainY = [t[1] for t in train_set]

devText = [t[0] for t in dev_set]
devY = [t[1] for t in dev_set]


testText = [t[0] for t in test_set]
testY = [t[1] for t in test_set]


# TODO(exercise): the right-hand side is intentionally left blank for you to fill in.
countVec = ## Initialize your count vectorizer with your arguments


# Learn vocabulary from train set
# TODO(exercise): `something` is a placeholder name; replace it with the
# vectorizer call that learns the vocabulary from trainText.
countVec.something()

# Transform each list of documents into a matrix of bag-of-words vectors
# TODO(exercise): fill in the three assignments below using countVec.
trainX = 
devX = 
testX = 

In [None]:
# Show the shape of the training matrix and a random sample of 20 vocabulary entries.
print("Shape of Train X {}\n".format(trainX.shape))

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on old releases.
if hasattr(countVec, "get_feature_names_out"):
    feature_names = countVec.get_feature_names_out()
else:
    feature_names = countVec.get_feature_names()
print("Sample of the vocab:\n {}".format(np.random.choice(feature_names, 20)))


## Step 3: Pick a model and experiment

Explore various models.

I recommend exploring:
1) [Logistic Regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
2) [SVM](http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC)

And look around the [library](http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm) for other options!


Remember, you can explore regularization strategies as well and change things like `C`



In [None]:
# Initialize your model
# TODO(exercise): YourModelClass is a placeholder that is not defined in the
# cells shown here; substitute one of the classifiers imported at the top
# (LogisticRegression, Perceptron, PassiveAggressiveClassifier, SVC).
model = YourModelClass()



In [None]:
# Train on the training split.
model.fit(trainX, trainY)


# Report the model's score on the data it was trained on and on the held-out dev split.
train_accuracy = model.score(trainX, trainY)
print("Train Accuracy:", train_accuracy)
dev_accuracy = model.score(devX, devY)
print("Dev Accuracy:", dev_accuracy)
print("--")

## Step 4: Analysis, Debugging the Model
To understand how to make the model better, it's important to understand what the model is learning, and what it's getting wrong.

Recall how we did this for Logistic regression, and feel free to look back at the earlier file if you get stuck. 

It can be helpful to inspect the highest weighted features of the model and look at some examples the model got wrong on the development set. 

From what you learn, you can go back and change the preprocessing, the feature extraction or play with the model. As you make changes, go back to this section.


In [None]:
print("Intepreting The model")

# The vocabulary does not change between labels, so build it once.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on older releases.
if hasattr(countVec, "get_feature_names_out"):
    vocab = np.asarray(countVec.get_feature_names_out())
else:
    vocab = np.array(countVec.get_feature_names())

# Label names come from the fetched dataset (`test`); the original referenced
# an undefined `test_data`, which raised NameError.
label_names = test['target_names']

# Never ask for more features than the vocabulary holds.
num_features = min(5, len(vocab))

for label in range(len(label_names)):
    coefs = model.coef_[label]

    # argpartition finds the indices of the largest weights without a full sort...
    top = np.argpartition(coefs, -num_features)[-num_features:]
    # ...then only those few indices are ordered by ascending weight.
    top = top[np.argsort(coefs[top])]
    s_coef = coefs[top]
    scored_vocab = list(zip(vocab[top], s_coef))
    print("Top weighted features for label {}:\n \n {}\n -- \n".format(label_names[label], scored_vocab))

In [None]:
## Find erronous dev errors
devPred = lr.predict(devX)
errors = []
for indx in range(len(devText)):
    if devPred[indx] != devY[indx]:
        error = "Review: \n {} \n Predicted: {} \n Correct: {} \n ---".format(
            devText[indx],
            devPred[indx],
            devY[indx])
        errors.append(error)

np.random.seed(1)
print("Random dev error: \n {} \n \n {} \n \n{}".format(
        np.random.choice(errors,1),
        np.random.choice(errors,1),
        np.random.choice(errors,1))
     )

In [None]:
## Step 5: Take best model, and report results on Test

In [None]:
# Score the model on the held-out test split.
test_accuracy = model.score(testX, testY)
print("Test Accuracy:", test_accuracy)
