# Vectorisation de type TF-IDF sur les données textuelles

## Import des librairies et des data

### Import des librairies

In [1]:
import os
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__name__)))
DATA_DIR = os.path.join(BASE_DIR, 'data')
print(BASE_DIR, DATA_DIR)
os.chdir(BASE_DIR)


c:\Users\mangg\projects\RakutenTeam c:\Users\mangg\projects\RakutenTeam\data


### Import des librairies nécessaires

In [None]:
import numpy as np
import pandas as pd
from src.features.text.transformers.text_merger import TextMerger
from src.features.text.transformers.extractors import YearExtractor, NumberExtractor, HashtagNumberExtractor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from scipy.sparse import csr_matrix, csr_array
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

### Import des données

In [2]:
df = pd.read_csv(os.path.join(DATA_DIR, "clean/X_train.csv"), index_col=0)
target = pd.read_csv(os.path.join(DATA_DIR, "clean/Y_train.csv"), index_col=0)["prdtypecode"]

## Feature Engineering

### Text Merging

In [3]:

merger = TextMerger(designation_column="designation", description_column="description", merged_column="full_description")
merged_text = merger.fit_transform(df)
df["full_description"] = merged_text

### TF-IDF Vectorization

In [4]:
vectorizer = TfidfVectorizer(max_features=1000)
vectorized_text = pd.DataFrame(data=vectorizer.fit_transform(df.full_description).toarray(), columns=vectorizer.get_feature_names_out(), index=df.index)
vectorized_text.head()

Unnamed: 0,01,04,05,10,100,11,12,120,122,13,...,élégant,élément,éléments,énergie,épaisseur,équipement,étanche,été,éviter,être
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.091744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226494


### Extracting 'N°' info

In [5]:
number_extractor = NumberExtractor(text_column="full_description")
numbers = number_extractor.fit_transform(df)
numbers.head()

0    0
1    1
2    0
3    0
4    0
Name: has_number, dtype: int8

### Extracting year info

In [6]:
year_extractor = YearExtractor(text_column="full_description")
years = year_extractor.fit_transform(df)
years.head()


Unnamed: 0,year_val,has_year
0,0,0
1,2001,1
2,0,0
3,2000,1
4,0,0


### Extracting Hashtag number info

In [7]:
hashtags_extractor = HashtagNumberExtractor(text_column="full_description")
hashtags = hashtags_extractor.fit_transform(df)
hashtags.head()

0    0
1    0
2    0
3    0
4    0
Name: has_hashtag, dtype: int8

### Merging and Scaling extracted features

In [8]:
extracted_features = pd.concat([numbers, years, hashtags], axis=1)
extracted_features.describe().round(3)

Unnamed: 0,has_number,year_val,has_year,has_hashtag
count,84916.0,84916.0,84916.0,84916.0
mean,0.063,265.12,0.138,0.005
std,0.244,663.551,0.345,0.069
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,2024.0,1.0,1.0


In [9]:
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(data=scaler.fit_transform(extracted_features), columns=extracted_features.columns, index=extracted_features.index)
scaled_features.describe().round(3)

Unnamed: 0,has_number,year_val,has_year,has_hashtag
count,84916.0,84916.0,84916.0,84916.0
mean,0.063,0.131,0.138,0.005
std,0.244,0.328,0.345,0.069
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0


### Features Merging

In [10]:
"""data = pd.concat([vectorized_text, scaled_features], axis=1)
data.head()"""
data = pd.concat([vectorized_text, scaled_features], axis=1)
data.head()

Unnamed: 0,01,04,05,10,100,11,12,120,122,13,...,épaisseur,équipement,étanche,été,éviter,être,has_number,year_val,has_year,has_hashtag
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.988636,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.988142,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.226494,0.0,0.0,0.0,0.0


## Model Selection

### Separating Training Set and Test Set

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, target.values, test_size=0.2, random_state=42)

### Set X_train and X_test as Sparse Matrices

In [12]:
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)

### Defining models to experiment with

In [13]:
clf_svc = LinearSVC(random_state=42)
clf_rf = RandomForestClassifier(random_state=42)
clf_lr = LogisticRegression(random_state=42)
clf_knn = KNeighborsClassifier()
clf_dum = DummyClassifier(random_state=42)

### Défining Param Grids for each classifier

In [14]:
svc_params = {"C":np.logspace(-2, 2, 5)}
rf_params = {"n_estimators": [10, 100], "max_depth": [None, 5, 10, 20, 30], "min_samples_split": [2, 5, 10], "min_samples_leaf": [1, 2, 4]}
lr_params = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"]}
knn_params = {"n_neighbors": [3, 5, 11, 19], "weights": ["uniform", "distance"], "metric": ["euclidean", "manhattan"]}
dum_params = {"strategy": ["stratified", "most_frequent", "prior", "uniform"]}

### Setting classifiers tab

In [15]:
classifiers = [
    ("Dummy", clf_dum, dum_params),
    
    ("Random Forest", clf_rf, rf_params),
    ("Logistic Regression", clf_lr, lr_params),
    ("KNN", clf_knn, knn_params),
    ("Linear SVC", clf_svc, svc_params),
    
]

### Setting results tab

In [16]:
results = pd.DataFrame(columns=["Accuracy", "Best params"], index=["Dummy", "Random Forest", "Logistic Regression", "KNN", "Linear SVC",])
best_models = []
best_params = []

In [17]:
clf = LinearSVC()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
clf.score(X_test, y_test)
report = classification_report(y_test, y_pred, output_dict=True)
report = pd.DataFrame(report).T



In [18]:
print(report)

              precision    recall  f1-score       support
10             0.381849  0.364379  0.372910    612.000000
40             0.616967  0.460653  0.527473    521.000000
50             0.663636  0.613445  0.637555    357.000000
60             0.867925  0.857143  0.862500    161.000000
1140           0.661972  0.610390  0.635135    539.000000
1160           0.460657  0.767176  0.575656    786.000000
1180           0.586207  0.232877  0.333333    146.000000
1280           0.622837  0.561915  0.590810    961.000000
1281           0.538462  0.396226  0.456522    424.000000
1300           0.859649  0.855236  0.857437    974.000000
1301           0.780000  0.692308  0.733542    169.000000
1302           0.674157  0.591716  0.630252    507.000000
1320           0.696707  0.598214  0.643715    672.000000
1560           0.745527  0.740375  0.742942   1013.000000
1920           0.860303  0.878716  0.869412    841.000000
1940           0.538462  0.562044  0.550000    137.000000
2060          

### Exploring Grid Search CV for different models

In [19]:
for name, clf, params in classifiers:
    print (f"Training {name}...")
    grid = GridSearchCV(clf, params, cv=5, n_jobs=3, verbose=3)
    grid.fit(X_train, y_train)
    print(f"Meilleurs paramètres pour {name}: {grid.best_params_}")
    print(f"Meilleur score pour {name}: {grid.best_score_:.3f}")
    test_score = grid.score(X_test, y_test)
    print(f"Score sur le test set pour {name}: {test_score:.3f}")
    results.loc[name, "Accuracy"] = test_score
    best_models.append({name: grid.best_estimator_})
    best_params.append({name: grid.best_params_})
    print(best_models, best_params)

Training Dummy...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Meilleurs paramètres pour Dummy: {'strategy': 'most_frequent'}
Meilleur score pour Dummy: 0.120
Score sur le test set pour Dummy: 0.121
[{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent')}] [{'Dummy': {'strategy': 'most_frequent'}}]
Training Random Forest...
Fitting 5 folds for each of 90 candidates, totalling 450 fits
Meilleurs paramètres pour Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Meilleur score pour Random Forest: 0.719
Score sur le test set pour Random Forest: 0.717
[{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent')}, {'Random Forest': RandomForestClassifier(min_samples_split=5, random_state=42)}] [{'Dummy': {'strategy': 'most_frequent'}}, {'Random Forest': {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}}]
Training Logistic Regression...
Fitting 5 folds for each of 8

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\mangg\anaconda3\envs\Rakuten\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\mangg\anaconda3\envs\Rakuten\lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\mangg\anaconda3\envs\Rakuten\lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\mangg\anaconda3\envs\Rakuten\lib\site-packages\sk

Meilleurs paramètres pour Logistic Regression: {'C': 10, 'penalty': 'l2'}
Meilleur score pour Logistic Regression: 0.712
Score sur le test set pour Logistic Regression: 0.710
[{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent')}, {'Random Forest': RandomForestClassifier(min_samples_split=5, random_state=42)}, {'Logistic Regression': LogisticRegression(C=10, random_state=42)}] [{'Dummy': {'strategy': 'most_frequent'}}, {'Random Forest': {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}}, {'Logistic Regression': {'C': 10, 'penalty': 'l2'}}]
Training KNN...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Meilleurs paramètres pour KNN: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Meilleur score pour KNN: 0.642


found 0 physical cores < 1
  File "c:\Users\mangg\anaconda3\envs\Rakuten\lib\site-packages\joblib\externals\loky\backend\context.py", line 217, in _count_physical_cores
    raise ValueError(


Score sur le test set pour KNN: 0.649
[{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent')}, {'Random Forest': RandomForestClassifier(min_samples_split=5, random_state=42)}, {'Logistic Regression': LogisticRegression(C=10, random_state=42)}, {'KNN': KNeighborsClassifier(metric='euclidean', weights='distance')}] [{'Dummy': {'strategy': 'most_frequent'}}, {'Random Forest': {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}}, {'Logistic Regression': {'C': 10, 'penalty': 'l2'}}, {'KNN': {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}}]
Training Linear SVC...
Fitting 5 folds for each of 5 candidates, totalling 25 fits




Meilleurs paramètres pour Linear SVC: {'C': 10.0}
Meilleur score pour Linear SVC: 0.711
Score sur le test set pour Linear SVC: 0.706
[{'Dummy': DummyClassifier(random_state=42, strategy='most_frequent')}, {'Random Forest': RandomForestClassifier(min_samples_split=5, random_state=42)}, {'Logistic Regression': LogisticRegression(C=10, random_state=42)}, {'KNN': KNeighborsClassifier(metric='euclidean', weights='distance')}, {'Linear SVC': LinearSVC(C=10.0, random_state=42)}] [{'Dummy': {'strategy': 'most_frequent'}}, {'Random Forest': {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}}, {'Logistic Regression': {'C': 10, 'penalty': 'l2'}}, {'KNN': {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}}, {'Linear SVC': {'C': 10.0}}]




### Display Results

In [20]:
results

Unnamed: 0,Accuracy,Best params
Dummy,0.120525,
Random Forest,0.717322,
Logistic Regression,0.710139,
KNN,0.64861,
Linear SVC,0.706312,


### Analysing Results of best estimator