In [115]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier

In [2]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
from textblob import TextBlob

In [28]:
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_lg")

In [8]:
train['sentiment_subjectivity'] = train['description'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

In [9]:
def tokenize(doc):
    blob = TextBlob(doc)
    words = TextBlob(doc).words
    #words = TextBlob(words.correct()).lemmatize()

    return [word.lemmatize() for word in words]
    #return words.correct()

In [10]:
def tokenize2(doc):
    return [doc for doc in nlp(doc)]

In [15]:
tokenize("4/16/1993 helloo there my goood fellowss and friends I'm John's wolves f2$#/n/ef \bb [f12] f142 122 p3n1s")

['4/16/1993',
 'helloo',
 'there',
 'my',
 'goood',
 'fellow',
 'and',
 'friend',
 'I',
 "'m",
 'John',
 "'s",
 'wolf',
 'f2',
 'n/ef',
 '\x08b',
 'f12',
 'f142',
 '122',
 'p3n1s']

In [27]:
tokenize2("4/16/1993 helloo the there my goood fellowss and friends I'm John's wolves f2$#/n/ef \bb [f12] f142 122 p3n1s")

[4/16/1993,
 helloo,
 the,
 there,
 my,
 goood,
 fellowss,
 and,
 friends,
 I,
 'm,
 John,
 's,
 wolves,
 f2$#/n,
 /,
 ef,
 b,
 [,
 f12,
 ],
 f142,
 122,
 p3n1s]

In [17]:
train['description'].apply(lambda x: tokenize(x))

0       [Sometimes, when, whisky, is, batched, a, few,...
1       [An, uncommon, exclusive, bottling, of, a, 6, ...
2       [This, release, is, a, port, version, of, Amru...
3       [This, 41, year, old, single, cask, wa, aged, ...
4       [Quite, herbal, on, the, nose, with, aroma, of...
                              ...                        
4082    [What, lie, beneath, the, surface, of, Dewar, ...
4083    [After, 6, to, 7, year, of, maturation, in, bo...
4084    [Bright, delicate, and, approachable, While, n...
4085    [I, ’, m, calling, this, the, pitmaster, ’, s,...
4086    [Spicy, sultana, greengage, plum, toffee, and,...
Name: description, Length: 4087, dtype: object

In [9]:
def get_word_vectors(docs):
    "Returns a 300 word term-doc matrix"
    return [nlp(doc).vector for doc in docs]

In [245]:
extend_stop_words = [' ', '', ',', '’ s', 's', 'd', 'doe', 'ha', 'le', 'll', 'm', 'n', 't', 'u', 've', 'wa', '‘', '’']
STOP_WORDS = nlp.Defaults.stop_words.union(extend_stop_words)

vect = TfidfVectorizer(stop_words=STOP_WORDS, tokenizer=tokenize, ngram_range=(1,2), sublinear_tf=True, 
                       min_df=.01, max_features=10000, max_df=.75)
#clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)

clf = RandomForestClassifier(n_estimators=375, max_depth=130)
#clf = XGBClassifier(n_estimators=375, max_depth=130)

pipe = Pipeline([
                 ('vect', vect), ('clf', clf)
                ])

In [112]:
dtm = vect.fit_transform(train['description'])
dtm = pd.DataFrame(data=dtm.todense(), columns=vect.get_feature_names())

In [21]:
x_trainnn = vect.fit_transform(train['description']).todense()

In [113]:
dtm.sum().sort_values(ascending=False)

oak               85.344731
fruit             83.688925
whisky            82.388973
finish            82.203024
note              81.800809
                    ...    
old integrates     0.152793
old feel 17        0.152793
old duty free      0.152793
old duty           0.152793
portfolio 17       0.152793
Length: 30000, dtype: float64

In [249]:
target = train['ratingCategory']
features = train.drop(columns=['ratingCategory', 'id'])
features = train['description']

parameters = {
    'vect__min_df': (.75, .85),
    'vect__min_df': (.01, .05),
    'clf__alpha': (1e-2, 1e-3)
}

parameters = {
    'vect__tokenizer': ('word', tokenize),
    'vect__stop_words':('english', STOP_WORDS)
    #'vect__max_df': (.7, .75, .8),
    #'vect__min_df': (.005, .01, .015),
    #'vect__max_features': (10000, 12000),
    #'clf__max_depth':(130,140),
    #'clf__n_estimators':(375,450)
}


grid_search = GridSearchCV(pipe, parameters, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(features, target)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    8.5s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    8.7s finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(max_df=0.89,
                                                        max_features=10000,
                                                        min_df=0.01,
                                                        ngram_range=(1, 2),
                                                        stop_words={'', ' ',
                                                                    "'d", "'ll",
                                                                    "'m", "'re",
                                                                    "'s", "'ve",
                                                                    ',', 'a',
                                                                    'about',
                                                                    'above',
                                                                    'across',
             

In [250]:
grid_search.best_score_

0.7440678386085802

In [251]:
grid_search.best_params_

{'vect__stop_words': {'',
  ' ',
  "'d",
  "'ll",
  "'m",
  "'re",
  "'s",
  "'ve",
  ',',
  'a',
  'about',
  'above',
  'across',
  'after',
  'afterwards',
  'again',
  'against',
  'all',
  'almost',
  'alone',
  'along',
  'already',
  'also',
  'although',
  'always',
  'am',
  'among',
  'amongst',
  'amount',
  'an',
  'and',
  'another',
  'any',
  'anyhow',
  'anyone',
  'anything',
  'anyway',
  'anywhere',
  'are',
  'around',
  'as',
  'at',
  'back',
  'be',
  'became',
  'because',
  'become',
  'becomes',
  'becoming',
  'been',
  'before',
  'beforehand',
  'behind',
  'being',
  'below',
  'beside',
  'besides',
  'between',
  'beyond',
  'both',
  'bottom',
  'but',
  'by',
  'ca',
  'call',
  'can',
  'cannot',
  'could',
  'd',
  'did',
  'do',
  'doe',
  'does',
  'doing',
  'done',
  'down',
  'due',
  'during',
  'each',
  'eight',
  'either',
  'eleven',
  'else',
  'elsewhere',
  'empty',
  'enough',
  'even',
  'ever',
  'every',
  'everyone',
  'everything',

In [252]:
tokenize

<function __main__.tokenize(doc)>

In [253]:
# Predictions on test sample
pred = grid_search.predict(test['description'])
pred

array([1, 1, 1, ..., 1, 0, 0], dtype=int64)

In [254]:
submission = pd.DataFrame({'id': test['id'], 'ratingCategory':pred})
submission['ratingCategory'] = submission['ratingCategory'].astype('int64')

In [255]:
# Make Sure the Category is an Integer
submission.head()

Unnamed: 0,id,ratingCategory
0,3461,1
1,2604,1
2,3341,1
3,3764,1
4,2306,1


In [256]:
subNumber = 3

In [257]:
# Save your Submission File
# Best to Use an Integer or Timestamp for different versions of your model

submission.to_csv(f'submission{subNumber}.csv', index=False)
subNumber += 1

In [52]:
### Other Algo
svd = TruncatedSVD(n_components=100,
                  algorithm='randomized',
                  n_iter=10)

rfc = RandomForestClassifier()

In [53]:
target = train['ratingCategory']
features = train.drop(columns=['ratingCategory', 'id'])
features = train['description']
#features = pd.concat([train['sentiment_subjectivity'],pd.DataFrame(vect.fit_transform(train['description']).todense())],axis=1)

parameters = {
    'lsi__svd__n_components': [10,100,250],
    'lsi__vect__max_df': (0.9, 1.0),
    'clf__max_depth':(5,10)
}


In [54]:
lsi = Pipeline([('vect', vect), ('svd', svd)])

pipe2 = Pipeline([('lsi', lsi), ('clf', rfc)])

In [138]:
grid_search = GridSearchCV(pipe2,parameters, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(features, target)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


ValueError: Invalid parameter vect for estimator Pipeline(steps=[('lsi',
                 Pipeline(steps=[('vect',
                                  TfidfVectorizer(ngram_range=(1, 2),
                                                  stop_words={'', ' ', "'d",
                                                              "'ll", "'m",
                                                              "'re", "'s",
                                                              "'ve", ',', 'a',
                                                              'about', 'above',
                                                              'across', 'after',
                                                              'afterwards',
                                                              'again',
                                                              'against', 'all',
                                                              'almost', 'alone',
                                                              'along',
                                                              'already', 'also',
                                                              'although',
                                                              'always', 'am',
                                                              'among',
                                                              'amongst',
                                                              'amount', 'an', ...},
                                                  tokenizer=<function tokenize at 0x000001E0B9256C10>)),
                                 ('svd',
                                  TruncatedSVD(n_components=100, n_iter=10))])),
                ('clf', RandomForestClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [137]:
grid_search.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'