# Exercises

## Ex: CountVectorizer

In [None]:
from sklearn.datasets import load_files

# Read the training corpus from disk (one sub-folder per category) and keep
# only the first 10,000 documents/labels to keep the exercises fast.
# NOTE(review): every sub-folder of 'aclimdb/train' becomes a class label —
# confirm the directory holds only the intended categories.
imdb_train = load_files('aclimdb/train')
X_train = imdb_train.data[:10000]
y_train = imdb_train.target[:10000]
len(X_train)

In [None]:
# Read the test corpus the same way and keep the first 2,500 documents/labels.
imdb_test = load_files('aclimdb/test')
X_test = imdb_test.data[:2500]
y_test = imdb_test.target[:2500]
len(X_test)

In [None]:
# from sklearn.datasets import fetch_20newsgroups
# dataset = fetch_20newsgroups(shuffle=True, random_state=1,remove=('headers', 'footers', 'quotes'))
# X,y = dataset.data, dataset.target

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Display the second raw test document to see what the input text looks like.
X_test[1]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents only, then encode both
# splits as token-count matrices using that same vocabulary.
cv = CountVectorizer()
cv.fit(X_train)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [None]:
# First 50 vocabulary entries. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() replaces it.
cv.get_feature_names_out()[:50]

In [None]:
# A 50-entry slice from deeper in the vocabulary. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2; use get_feature_names_out().
cv.get_feature_names_out()[50000:50050]

In [None]:
# Column indices of the non-zero counts in row 150 of the training matrix
# (nonzero() returns (rows, cols); [1] selects the column indices).
X_train_cv[150].nonzero()[1]

## Ex: Cross Validation

In [None]:
from sklearn.svm import SVC

# Train a support-vector classifier with default settings on the count matrix.
clf = SVC()
clf.fit(X_train_cv, y_train)

In [None]:
# Evaluate the fitted classifier on the vectorized test split.
clf.score(X_test_cv,y_test)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the vectorized training data; report the mean
# and spread of the per-fold scores.
scores = cross_val_score(estimator=clf, X=X_train_cv, y=y_train, cv=5)
scores.mean(), scores.std()

## Ex: Pipeline

In [None]:
from sklearn.pipeline import make_pipeline

# Chain vectorization and classification so raw documents can be passed in
# directly for both fitting and scoring.
pipe = make_pipeline(CountVectorizer(), SVC())
pipe.fit(X_train, y_train)

pipe.score(X_test, y_test)

## Ex: GridSearchCV

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC

# Explicitly named steps: the names ('cv', 'svc') are the prefixes used when
# addressing step parameters in a hyper-parameter search.
steps = [
    ('cv', CountVectorizer()),
    ('svc', SVC()),
]
pipe = Pipeline(steps)

In [None]:
from sklearn.model_selection import GridSearchCV

# Pipeline hyper-parameters are addressed as '<step name>__<parameter>'.
# The SVC step is named 'svc', so both keys carry the 'svc__' prefix, while
# the kernel *values* are plain kernel names ('linear', 'rbf').
param_grid = {'svc__kernel': ('linear', 'rbf'), 'svc__C': [1, 10, 100, 1000]}

# Exhaustively evaluate every kernel/C combination with 5-fold cross-validation.
clf = GridSearchCV(pipe, param_grid, cv=5).fit(X_train, y_train)

print(clf.best_params_)
print(clf.best_score_)

## Ex: RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate settings for the 'svc' pipeline step.
param_grid = {
    'svc__kernel': ('linear', 'rbf'),
    'svc__C': [1, 10, 100, 1000],
}

# Sample parameter combinations at random and score each with 5-fold CV.
clf = RandomizedSearchCV(pipe, param_grid, cv=5)
clf.fit(X_train, y_train)

print(clf.best_params_)
print(clf.best_score_)

## Ex: Ensemble Estimators

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

# Same text pipeline as before, with a gradient-boosting classifier as the
# final step (named 'gb').
steps = [
    ('cv', CountVectorizer()),
    ('gb', GradientBoostingClassifier()),
]
pipe = Pipeline(steps)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search over the number of boosting stages: 10, 20, ..., 90.
param_grid = {'gb__n_estimators': range(10, 100, 10)}
clf = RandomizedSearchCV(pipe, param_grid, cv=5)
clf.fit(X_train, y_train)

In [None]:
# Report the best parameter setting found by the search and its score.
print(clf.best_params_)    
print(clf.best_score_)