## Review: what we did in Week 3: ML Assignment 1 data
* Read data files (csv and tsv)
* Get to know the data
* Create a smaller subset of the data
## [Jump to Week 4 material](#thisWeek)

In [1]:
# imports and specifications
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### read Amazon.csv

In [2]:
toxic_data = pd.read_csv('../final_data/toxiccomments_train.csv')
movie_data = pd.read_csv('../final_data/moviereviews_train.tsv', sep='\t')

### toxic data

In [3]:
print(type(toxic_data), "\n")
print(toxic_data.shape, "\n")
print(toxic_data.dtypes, "\n")
print(toxic_data.head(5))

<class 'pandas.core.frame.DataFrame'> 

(159571, 8) 

id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object 

                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0    

### movie review data

In [4]:
print(type(movie_data), "\n")
print(movie_data.shape, "\n")
print(movie_data.dtypes, "\n")
print(movie_data.head(5))

<class 'pandas.core.frame.DataFrame'> 

(25000, 3) 

id           object
sentiment     int64
review       object
dtype: object 

       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


### create a ndarray for `L` for `movie_data`

In [5]:
L = movie_data["sentiment"]
print(type(L))
print(type(L.values))
print(L.shape)

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
(25000,)


### create a ndarray for `X`
Use only "word_count" as a feature, for now.

In [6]:
movie_data['word_count'] = movie_data['review'].str.split(' ').str.len()
movie_data['punc_count'] = movie_data['review'].str.count("\.")
print(movie_data.head(), "\n")

X = movie_data[['word_count', 'punc_count']]
print(type(X))
print(type(X.values))
print(X.shape)

       id  sentiment                                             review  \
0  5814_8          1  With all this stuff going down at the moment w...   
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...   
2  7759_3          0  The film starts with a manager (Nicholas Bell)...   
3  3630_4          0  It must be assumed that those who praised this...   
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...   

   word_count  punc_count  
0         433          20  
1         158          16  
2         378          20  
3         379           8  
4         367           9   

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
(25000, 2)


## <a name='thisWeek'></a>Week 4: fit linear classifier using gradient descent and assess the fit of the model

### using the `SGDClassifier` class in `linear_model`, fit the model according to given training data

In [7]:
from sklearn import linear_model
sgd = linear_model.SGDClassifier(loss="squared_loss")
sgd.fit(X, L)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='squared_loss',
       max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=None, validation_fraction=0.1, verbose=0, warm_start=False)

In [8]:
# number & proportion of accurate predictions
print(sum(sgd.predict(X) == L.values))
print(sum(sgd.predict(X) == L.values) / L.shape)

12500
[0.5]


### how well did we do? compare the model's predictions for  `Y` to the labels `L`
We'll start with the first few measures in Flach, p. 57

In [9]:
import my_measures

sgd_pm = my_measures.BinaryClassificationPerformance(sgd.predict(X), L, 'sgd')
sgd_pm.compute_measures()
print(sgd_pm.performance_measures)

{'Pos': 12500, 'Neg': 12500, 'TP': 12500, 'TN': 0, 'FP': 12500, 'FN': 0, 'Accuracy': 0.5, 'Precision': 0.5, 'Recall': 1.0, 'desc': 'sgd'}


## Normalization

*[Normalization](https://scikit-learn.org/stable/modules/preprocessing.html#normalization) is the process of scaling individual samples to have unit norm.*

In [11]:
X.describe()

Unnamed: 0,word_count,punc_count
count,25000.0,25000.0
mean,233.78624,13.08768
std,173.745845,9.811129
min,10.0,0.0
25%,127.0,7.0
50%,174.0,10.0
75%,284.0,16.0
max,2470.0,149.0


In [12]:
from sklearn import preprocessing
X_normalized = preprocessing.normalize(X)

In [13]:
pd.DataFrame(X_normalized).describe()

Unnamed: 0,0,1
count,25000.0,25000.0
mean,0.997652,0.060862
std,0.004993,0.030993
min,0.651214,0.0
25%,0.997459,0.043267
50%,0.99846,0.05547
75%,0.999064,0.071247
max,1.0,0.758895
