# Preprocessing

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import RandomizedSearchCV, cross_validate
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor

In [29]:
dataset_d16 = '/Users/monmac/Desktop/preproc_data_d16.csv'
dataset_d2 = '/Users/monmac/Desktop/preproc_data_d2.csv'
data_d16 = pd.read_csv(dataset_d16)
data_d2 = pd.read_csv(dataset_d2)
data_d16.head()

Unnamed: 0,date,jour,service,CA_TTC,CA_HT,TVA,temp,feels_like,temp_min,temp_max,wind_speed,clouds_all,weather_main,weather_description,match_edf,roland_garros,fashion_week,Match Happening,Match Happening-CL,vacances_paris
0,2019-09-01,Dimanche,midi,3548.0,3191.82,356.18,294.08,293.35,292.94,295.87,2.68,0.0,Clear,sky is clear,0.0,0.0,0.0,,,1.0
1,2019-09-01,Dimanche,soir,4577.1,4035.3,541.8,293.37,292.6,291.95,294.4,4.02,0.0,Clear,sky is clear,0.0,0.0,0.0,,,
2,2019-09-02,Lundi,midi,2089.0,1888.79,200.21,294.26,293.47,292.71,295.87,2.6,0.0,Clear,sky is clear,0.0,0.0,0.0,0.0,,0.0
3,2019-09-02,Lundi,soir,5726.5,5056.44,670.06,294.0,293.08,292.25,295.36,0.45,0.0,Clear,sky is clear,0.0,0.0,0.0,,,
4,2019-09-03,Mardi,midi,1745.25,1567.27,177.98,295.07,294.57,293.18,296.42,0.45,40.0,Clouds,scattered clouds,0.0,0.0,0.0,,,0.0


In [30]:
data_d16.fillna(value = "0.0", inplace = True)
data_d2.fillna(value = "0.0", inplace = True)

In [8]:
weather_description = pd.DataFrame(np.concatenate([data_d16.weather_description.unique(),data_d2.weather_description.unique()])).drop_duplicates()
weather_description.sort_values(by=0,axis=0, inplace=True)

In [9]:
weather_sorted = {
'Clear' : ['sky is clear', np.nan, np.nan,np.nan], 
'Clouds' : ['scattered clouds','few clouds','broken clouds','overcast clouds'],
'Drizzle' : ['light intensity drizzle','drizzle','heavy intensity drizzle',np.nan], 
'Drizzle and Rain' : ['light intensity drizzle rain','rain and drizzle',np.nan,np.nan],
'Fog' : ['fog', np.nan, np.nan,np.nan], 
'Haze' : ['haze', np.nan, np.nan,np.nan], 
'Mist' : ['mist', np.nan, np.nan,np.nan], 
'Rain' : ['light rain','light intensity shower rain','moderate rain', 'heavy intensity rain'], 
'Snow' : ['light snow', np.nan, np.nan,np.nan], 
'Thunderstorm' : ['proximity thunderstorm', 'thunderstorm', 'thunderstorm with light rain','thunderstorm with heavy rain'] 
}

weather_sorted_df = pd.DataFrame(data=weather_sorted)
weather_sorted_df

Unnamed: 0,Clear,Clouds,Drizzle,Drizzle and Rain,Fog,Haze,Mist,Rain,Snow,Thunderstorm
0,sky is clear,scattered clouds,light intensity drizzle,light intensity drizzle rain,fog,haze,mist,light rain,light snow,proximity thunderstorm
1,,few clouds,drizzle,rain and drizzle,,,,light intensity shower rain,,thunderstorm
2,,broken clouds,heavy intensity drizzle,,,,,moderate rain,,thunderstorm with light rain
3,,overcast clouds,,,,,,heavy intensity rain,,thunderstorm with heavy rain


In [10]:
feat_numerical_nunique_d2 = pd.DataFrame(data_d2.select_dtypes(exclude=['object'],include=['int64','float64']).nunique(), columns = ["unique_values"])
feat_numerical_nunique_d2

Unnamed: 0,unique_values
CA_TTC,1297
CA_HT,1314
TVA,1308
temp,1040
feels_like,1081
temp_min,605
temp_max,676
wind_speed,43
clouds_all,7
match_edf,2


In [11]:
feat_categorical_nunique_d2 = pd.DataFrame(data_d2.select_dtypes(include=['object'],exclude=['int64','float64']).nunique(), columns = ["unique_values"])
feat_categorical_nunique_d2

Unnamed: 0,unique_values
date,672
jour,7
service,2
weather_main,8
weather_description,21
Match Happening,2
Match Happening-CL,2
vacances_paris,3


In [28]:
#data.select_dtypes(include=['object'],exclude=['int64','float64']).nunique().index.sort_values

In [35]:
data_d16['Match Happening'] = pd.to_numeric(data_d16['Match Happening'])
data_d16['Match Happening-CL'] = pd.to_numeric(data_d16['Match Happening-CL'])
data_d16['vacances_paris'] = pd.to_numeric(data_d16['vacances_paris'])

In [38]:
data_d2['Match Happening'] = pd.to_numeric(data_d2['Match Happening'])
data_d2['Match Happening-CL'] = pd.to_numeric(data_d2['Match Happening-CL'])
data_d2['vacances_paris'] = pd.to_numeric(data_d2['vacances_paris'])


data_d2['vacances_paris'] = data_d2['vacances_paris'].astype(object)

In [78]:
data_d2['vacances_paris'] = data_d2['vacances_paris'].astype(object)

In [79]:
data_d2.dtypes

date                    object
jour                    object
service                 object
CA_TTC                 float64
CA_HT                  float64
TVA                    float64
temp                   float64
feels_like             float64
temp_min               float64
temp_max               float64
wind_speed             float64
clouds_all             float64
weather_main            object
weather_description     object
match_edf              float64
roland_garros          float64
fashion_week           float64
Match Happening        float64
Match Happening-CL     float64
vacances_paris          object
dtype: object

In [None]:
numerical_columns = 
categorical_columns = 
ordinal_columns =

In [68]:
preproc_numerical_baseline = make_pipeline(
    RobustScaler())

preproc_categorical_baseline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preproc_baseline = make_column_transformer(
    (preproc_numerical_baseline, make_column_selector(dtype_include=["int64", "float64"])),
    (preproc_categorical_baseline, make_column_selector(pattern=),
    (preproc_ordinal_d16, make_column_selector(pattern=))
    remainder="drop")

NameError: name 'feat_categorical_small' is not defined

In [74]:
d16 = pd.read_csv('/Users/monmac/Desktop/data_d16.csv')

In [76]:
d16.dtypes

date                    object
jour                    object
service                 object
CA_TTC                 float64
CA_HT                  float64
TVA                    float64
temp                   float64
feels_like             float64
temp_min               float64
temp_max               float64
wind_speed             float64
clouds_all             float64
weather_main            object
weather_description     object
match_edf              float64
roland_garros          float64
fashion_week           float64
Match Happening        float64
Match Happening-CL     float64
vacances_paris         float64
Clear                   object
Clouds                  object
Drizzle                 object
Drizzle and Rain        object
Fog                     object
Haze                    object
Mist                    object
Rain                    object
Snow                     int64
Thunderstorm            object
dtype: object

# Model 

In [64]:
X = data_d2.drop(labels=['CA_TTC', 'CA_HT','TVA'], axis=1)
y = data_d2['CA_TTC']

In [54]:
#plt.hist(data_d2['CA_TTC'])

In [50]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()

cv_results = cross_validate(model, X, y, cv = 20)

cv_score = cv_results['test_score'].mean()

cv_score

Traceback (most recent call last):
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 518, in fit
    X, y = self._validate_data(X, y, accept_sparse=accept_sparse,
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  

nan

In [55]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', RobustScaler()),
    ('model', LinearRegression())
])

In [57]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(
  pipe, 
  param_grid={'imputer__n_neighbors': [4,5,6]
  },
  cv=5,
  scoring="accuracy")

#grid_search.fit(X, y)
#grid_search.best_params_

In [59]:
tuned_pipe = grid_search.best_estimator_

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [58]:
cv_results = cross_validate(tuned_pipe, X,y, cv = 20, scoring = "accuracy")

base_score = cv_results['test_score'].mean()

base_score

NameError: name 'tuned_pipe' is not defined

In [63]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=2)
tree_clf.fit(X,y)

ValueError: could not convert string to float: 'Mardi'

In [65]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate

tree = DecisionTreeRegressor()

cv_results = cross_validate(tree, X, y, scoring = "r2", cv=10)

Traceback (most recent call last):
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 1252, in fit
    super().fit(
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/tree/_classes.py", line 157, in fit
    X, y = self._validate_data(X, y,
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/base.py", line 430, in _validate_data
    X = check_array(X, **check_X_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site

In [66]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=100)

cv_results = cross_validate(forest, X, y, scoring = "r2", cv=10)

print(cv_results['test_score'])
print('mean r2: ',cv_results['test_score'].mean())
print('std r2: ', cv_results['test_score'].std())

[nan nan nan nan nan nan nan nan nan nan]
mean r2:  nan
std r2:  nan


Traceback (most recent call last):
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 304, in fit
    X, y = self._validate_data(X, y, multi_output=True,
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/Users/monmac/.pyenv/versions/3.8.6/envs/resto-project/lib/python3.8/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/Users

In [67]:
from xgboost import XGBRegressor()
xgb_reg.fit(
    X_train, y_train,
    # evaluate loss at each iteration
    eval_set=[(X_val, y_val)], 
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=5
) 
y_pred = xgb_reg.predict(X_val)

SyntaxError: invalid syntax (357369733.py, line 1)

In [82]:
d16=pd.read_csv('/Users/monmac/Desktop/preproc_data_d16.csv')

In [89]:
d16[d16['Match Happening-CL']==0.0]

Unnamed: 0,date,jour,service,CA_TTC,CA_HT,TVA,temp,feels_like,temp_min,temp_max,wind_speed,clouds_all,weather_main,weather_description,match_edf,roland_garros,fashion_week,Match Happening,Match Happening-CL,vacances_paris
34,2019-09-18,Mercredi,midi,4077.5,3663.41,414.09,291.23,290.22,290.36,292.47,2.24,0.0,Clear,sky is clear,0.0,0.0,0.0,0.0,0.0,0.0
105,2019-10-22,Mardi,midi,3671.75,3314.55,357.2,284.21,283.85,281.95,285.96,1.5,90.0,Mist,mist,0.0,0.0,0.0,0.0,0.0,1.0
152,2019-11-12,Mardi,midi,3073.5,2760.77,312.73,281.18,281.18,279.17,282.92,0.45,75.0,Rain,light rain,0.0,0.0,0.0,0.0,0.0,0.0
182,2019-11-26,Mardi,midi,6.0,5.45,0.55,284.94,284.58,283.81,286.13,0.89,90.0,Rain,light rain,0.0,0.0,0.0,0.0,0.0,0.0
195,2019-12-02,Lundi,midi,2383.0,2147.05,235.95,278.87,277.13,277.49,279.87,2.24,90.0,Clouds,overcast clouds,0.0,0.0,0.0,0.0,0.0,0.0
344,2020-02-12,Mercredi,midi,3135.0,2822.8,312.2,282.08,282.08,280.47,283.02,0.89,75.0,Clouds,broken clouds,0.0,0.0,0.0,0.0,0.0,1.0
356,2020-02-18,Mardi,midi,2105.0,1889.85,215.15,283.13,281.12,282.25,284.14,4.02,75.0,Clouds,broken clouds,0.0,0.0,0.0,0.0,0.0,1.0
627,2020-09-12,Samedi,midi,3998.0,3601.67,396.33,293.79,293.42,292.58,295.87,1.34,20.0,Clouds,few clouds,0.0,0.0,0.0,0.0,0.0,0.0
707,2020-10-20,Mardi,midi,2620.5,2352.12,268.38,287.61,287.38,286.87,288.16,0.45,1.0,Rain,light rain,0.0,0.0,0.0,0.0,0.0,1.0
720,2020-10-28,Mercredi,midi,3256.5,2936.59,319.91,286.61,286.1,285.8,287.47,2.68,75.0,Clouds,broken clouds,0.0,0.0,0.0,0.0,0.0,1.0
