<a href="https://colab.research.google.com/github/zacherymoy/DS-Unit-4-Sprint-1-NLP/blob/master/KaggleDSPT4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# import zipfile

# with zipfile.ZipFile("whiskey-reviews-dspt4.zip","r") as zip_ref:
#     zip_ref.extractall("targetdir")

In [5]:
import pandas as pd
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

#sample_submission_csv = pandas.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

train.head()

Unnamed: 0,id,description,ratingCategory
0,1321,"\nSometimes, when whisky is batched, a few lef...",1
1,3861,\nAn uncommon exclusive bottling of a 6 year o...,0
2,655,\nThis release is a port version of Amrut’s In...,1
3,555,\nThis 41 year old single cask was aged in a s...,1
4,1965,"\nQuite herbal on the nose, with aromas of dri...",1


## Feature Engineering

In [0]:
train['clean_descriptions'] = train['description'].apply(lambda x: x[1:-1].replace(',', ' '))


In [0]:
data = train['clean_descriptions']
target = train['ratingCategory']

In [8]:
train.head()

Unnamed: 0,id,description,ratingCategory,clean_descriptions
0,1321,"\nSometimes, when whisky is batched, a few lef...",1,Sometimes when whisky is batched a few lefto...
1,3861,\nAn uncommon exclusive bottling of a 6 year o...,0,An uncommon exclusive bottling of a 6 year old...
2,655,\nThis release is a port version of Amrut’s In...,1,This release is a port version of Amrut’s Inte...
3,555,\nThis 41 year old single cask was aged in a s...,1,This 41 year old single cask was aged in a she...
4,1965,"\nQuite herbal on the nose, with aromas of dri...",1,Quite herbal on the nose with aromas of dried...


## Load Data 

In [9]:
data.head()

0    Sometimes  when whisky is batched  a few lefto...
1    An uncommon exclusive bottling of a 6 year old...
2    This release is a port version of Amrut’s Inte...
3    This 41 year old single cask was aged in a she...
4    Quite herbal on the nose  with aromas of dried...
Name: clean_descriptions, dtype: object

## Define Pipeline Components

In [0]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=5)
rfc = RandomForestClassifier()


# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect),
                 # Classifier
                 ('clf', rfc),
                ])

## Define search space

In [11]:
parameters = {
    'vect__max_df': ( 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,1000),
    'clf__n_estimators':(5, 10, 15),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe,parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(data, target)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  3.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=5,
                                                        ngram_range=(1, 2),
                                                        no

In [12]:
grid_search.best_score_


0.7186211106888163

In [22]:
grid_search.predict(['Send me money now', 'you won the lottery'])


array([1, 1])

## Submission File

In [0]:
# Predictions on test sample
pred = grid_search.predict(test['description'])

In [0]:
submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
submission['ratingCategory'] = submission['ratingCategory'].astype('int64')

In [0]:
submission.to_csv(f'submission.csv', index=False)


## Version 2

In [17]:
!python -m spacy download en_core_web_lg


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import spacy
nlp = spacy.load("en_core_web_lg")

In [0]:
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [0]:
X = get_word_vectors(train['description'])



In [0]:
X_test = get_word_vectors(test['description'])


In [25]:
rfc.fit(X, train['ratingCategory'])


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
rfc.score(X, train['ratingCategory'])


1.0

In [27]:
rfc.predict(X_test)


array([1, 1, 1, ..., 1, 1, 0])

In [0]:
test['ratingCategory'] = rfc.predict(X_test)


In [0]:
test[['id', 'ratingCategory']].to_csv('testSolutionSubmission2.csv', header=True, index=False)
