# TfidfVectorizer

In [13]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('train.csv', sep='\t', index_col=0)
df

Unnamed: 0_level_0,Score,Text
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Positive,В Альфа-Банке работает замечательная девушка -...
1,Negative,Оформляя рассрочку в м. Видео в меге тёплый ст...
2,Positive,Очень порадовала оперативность работы в банке....
3,Negative,Имела неосторожность оформить потреб. кредит в...
4,Negative,Небольшая предыстория: Нашел на сайте MDM банк...
...,...,...
13994,Positive,"О высокой надёжности МКБ, порядочности и добро..."
13995,Positive,"Обслуживаюсь в офисе на Чернореченской 42а, ка..."
13996,Positive,Попала сегодня в очень неприятную ситуацию. Ре...
13997,Positive,Добрый день! Давно являюсь клиентом банка Русс...


In [4]:
le = LabelEncoder()
df['Score'] = le.fit_transform(df['Score'])
df

Unnamed: 0_level_0,Score,Text
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,В Альфа-Банке работает замечательная девушка -...
1,0,Оформляя рассрочку в м. Видео в меге тёплый ст...
2,1,Очень порадовала оперативность работы в банке....
3,0,Имела неосторожность оформить потреб. кредит в...
4,0,Небольшая предыстория: Нашел на сайте MDM банк...
...,...,...
13994,1,"О высокой надёжности МКБ, порядочности и добро..."
13995,1,"Обслуживаюсь в офисе на Чернореченской 42а, ка..."
13996,1,Попала сегодня в очень неприятную ситуацию. Ре...
13997,1,Добрый день! Давно являюсь клиентом банка Русс...


In [5]:
df.isna().sum()

Score    0
Text     0
dtype: int64

In [6]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X = vectorizer.fit_transform(df['Text'])
X

<13999x106454 sparse matrix of type '<class 'numpy.float64'>'
	with 2171131 stored elements in Compressed Sparse Row format>

In [7]:
y = np.array(df['Score'])
y

array([1, 0, 1, ..., 1, 1, 0])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

## SVC

In [9]:
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)

In [10]:
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.9642857142857143

## random forest

In [13]:
rfc_rs = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid={
        # 'n_estimators': [20, 50, 100, 200, 250, 300],
        'n_estimators': [20, 50, 100],
        # 'max_depth': [7, 8, 10, 20, 25, None],
        'max_depth': [10, 20, None],
        'max_features': ['sqrt', 'log2', None],
        'criterion': ['gini', 'entropy', 'log_loss'],
        # 'min_samples_split': [1, 2, 5, 10],
        'min_samples_split': [1, 5],
        'n_jobs': [-1]
    },
    scoring='f1',
    verbose=52,
    cv=2
)
rfc_rs.fit(X, y)

Fitting 2 folds for each of 162 candidates, totalling 324 fits
[CV 1/2; 1/162] START criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1
[CV 1/2; 1/162] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1;, score=nan total time=   0.0s
[CV 2/2; 1/162] START criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1
[CV 2/2; 1/162] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=20, n_jobs=-1;, score=nan total time=   0.0s
[CV 1/2; 2/162] START criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1
[CV 1/2; 2/162] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1;, score=nan total time=   0.0s
[CV 2/2; 2/162] START criterion=gini, max_depth=10, max_features=sqrt, min_samples_split=1, n_estimators=50, n_jobs=-1
[CV 2/2; 2/162] E

162 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\ffedo\AppData\Local\Programs\Python\Python311\L

In [14]:
y_pred = rfc_rs.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

## knn

In [42]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)

In [43]:
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)

0.8142857142857143

## catboost

In [10]:
cat = CatBoostClassifier(iterations=1000, learning_rate=0.01, 
                         depth=4, verbose=200,  eval_metric='Accuracy',
                         l2_leaf_reg=4, od_wait=1000, use_best_model = True, task_type='GPU')
cat.fit(X_train, y_train, eval_set=(X_test, y_test))

0:	learn: 0.7949347	test: 0.8071429	best: 0.8071429 (0)	total: 827ms	remaining: 13m 46s
200:	learn: 0.8585035	test: 0.8571429	best: 0.8571429 (130)	total: 1m 46s	remaining: 7m 3s
400:	learn: 0.8872935	test: 0.8785714	best: 0.8857143 (388)	total: 3m 27s	remaining: 5m 9s
600:	learn: 0.9041778	test: 0.8928571	best: 0.8928571 (419)	total: 5m 4s	remaining: 3m 22s
800:	learn: 0.9136301	test: 0.9000000	best: 0.9000000 (632)	total: 6m 41s	remaining: 1m 39s
999:	learn: 0.9229382	test: 0.9000000	best: 0.9000000 (632)	total: 8m 21s	remaining: 0us
bestTest = 0.9
bestIteration = 632
Shrink model to first 633 iterations.


<catboost.core.CatBoostClassifier at 0x2c83dd12710>

In [11]:
y_pred = cat.predict(X_test)
accuracy_score(y_test, y_pred)

0.9