In [33]:

import numpy as np
import pandas as pd 


In [34]:
# Read the tips dataset from the CSV file kept under ./datasets.
tips_csv_path = "./datasets/tips.csv"
df = pd.read_csv(tips_csv_path)

# Regresyon için Önişleme

In [35]:
# One-hot encode each categorical column and append the indicator columns to
# the right of the existing frame, in the order sex, day, time, smoker.
# The original categorical columns are kept here; they are removed later.
categorical_columns = ["sex", "day", "time", "smoker"]
dummy_frames = [
    pd.get_dummies(df[column_name], prefix=column_name)
    for column_name in categorical_columns
]
df = pd.concat([df, *dummy_frames], axis=1)
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes
0,16.99,1.01,Female,No,Sun,Dinner,2,1,0,0,0,1,0,1,0,1,0
1,10.34,1.66,Male,No,Sun,Dinner,3,0,1,0,0,1,0,1,0,1,0
2,21.01,3.5,Male,No,Sun,Dinner,3,0,1,0,0,1,0,1,0,1,0
3,23.68,3.31,Male,No,Sun,Dinner,2,0,1,0,0,1,0,1,0,1,0
4,24.59,3.61,Female,No,Sun,Dinner,4,1,0,0,0,1,0,1,0,1,0


In [36]:
# Numeric columns that are passed to the scaler in the next cell.
columns_to_scale = ["tip", "size", "total_bill"]

In [37]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected numeric columns with min-max scaling.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# fit_transform returns a bare NumPy array, so the row labels are lost.
# Reusing df's index guarantees that a later column-wise concat with df
# lines the rows up one-to-one even if df does not have a default RangeIndex.
scaled_columns = pd.DataFrame(
    scaled_values, columns=columns_to_scale, index=df.index
)

# NOTE(review): the scaler is fitted on every row before any train/test split,
# and "tip" (later used as the regression target) is scaled too, so error
# metrics downstream are expressed in scaled units — confirm this is intended.
scaled_columns.describe()

Unnamed: 0,tip,size,total_bill
count,244.0,244.0,244.0
mean,0.222031,0.313934,0.350145
std,0.153738,0.19022,0.186477
min,0.0,0.0,0.0
25%,0.111111,0.2,0.215281
50%,0.211111,0.2,0.308442
75%,0.284722,0.4,0.441087
max,1.0,1.0,1.0


In [38]:
# Swap the raw numeric and categorical columns for their scaled counterparts:
# drop the originals, then append the scaled columns on the right.
raw_columns = ["total_bill", "tip", "size", "smoker", "sex", "day", "time"]
df = pd.concat([df.drop(columns=raw_columns), scaled_columns], axis=1)

In [39]:
# Preview the first five rows of the encoded and scaled frame.
df.head(n=5)

Unnamed: 0,sex_Female,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch,smoker_No,smoker_Yes,tip,size,total_bill
0,1,0,0,0,1,0,1,0,1,0,0.001111,0.2,0.291579
1,0,1,0,0,1,0,1,0,1,0,0.073333,0.4,0.152283
2,0,1,0,0,1,0,1,0,1,0,0.277778,0.4,0.375786
3,0,1,0,0,1,0,1,0,1,0,0.256667,0.2,0.431713
4,1,0,0,0,1,0,1,0,1,0,0.29,0.6,0.450775


In [40]:
# For each two-level category keep a single indicator column; the dropped
# column is fully determined by the one that stays.
# NOTE(review): all four day_* indicators are still kept after this step, so
# the "day" encoding remains redundant — confirm whether one should be dropped.
redundant_dummies = ["sex_Female", "time_Dinner", "smoker_No"]
df = df.drop(columns=redundant_dummies)

In [41]:
# Regression setup: the (scaled) tip is the target, every other column is a feature.
Y = df["tip"]
X = df.drop(columns="tip")

# **Model Eğitme**

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. random_state pins the shuffle so the
# split, and every metric computed from it, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=42
)

In [43]:
from sklearn.svm import SVR

# Fit a support vector regressor with its default hyper-parameters on the
# training split (fit returns the estimator itself), then predict the held-out rows.
svregressor = SVR().fit(X_train, y_train)
predsvr = svregressor.predict(X_test)

## Değerlendirme

In [44]:
from sklearn.metrics import r2_score

# Coefficient of determination of the SVR predictions on the held-out rows.
svr_r2 = r2_score(y_test, predsvr)
print(svr_r2)

0.37695369472935947


In [45]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the SVR predictions on the held-out rows
# (the target was scaled earlier, so this is in scaled units).
svr_mse = mean_squared_error(y_test, predsvr)
print(svr_mse)

0.011150782488626578


In [46]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the SVR predictions on the held-out rows
# (the target was scaled earlier, so this is in scaled units).
svr_mae = mean_absolute_error(y_test, predsvr)
print(svr_mae)

0.07355547461567172


# Doğrulama 

In [47]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the regressor on the full data set; the cell
# displays the average of the per-fold scores.
svr_cv_scores = cross_val_score(svregressor, X, Y, cv=10)
svr_cv_scores.mean()

0.3211912064777266

# Tuning

Grid Search Cross Validation

In [48]:
import sklearn
from sklearn import metrics

# List the scorer names that can be passed as `scoring=` to the search and
# cross-validation helpers. The module-level SCORERS dict no longer exists in
# current scikit-learn releases; get_scorer_names() is its replacement. Fall
# back to SCORERS only on old installations that predate get_scorer_names().
if hasattr(metrics, "get_scorer_names"):
    scorer_names = metrics.get_scorer_names()
else:
    scorer_names = list(metrics.SCORERS.keys())
scorer_names

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [49]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# Exhaustive 5-fold grid search over SVR hyper-parameters.
# gamma is a kernel coefficient for the 'poly' and 'rbf' kernels only; the
# linear kernel ignores it. Crossing gamma with "linear" therefore refits the
# same model once per gamma value, so the linear kernel gets its own sub-grid
# without gamma. The poly/rbf sub-grid is listed first so the candidate order
# at the top of the search is unchanged.
c_values = [0.0001, 0.01, 0.1, 1, 10]
gamma_values = [0.0001, 0.01, 0.1, 1, 10]
param_grid = [
    {"C": c_values, "kernel": ["poly", "rbf"], "gamma": gamma_values},
    {"C": c_values, "kernel": ["linear"]},
]
grid_search = GridSearchCV(svregressor, param_grid, cv=5)
grid_search.fit(X, Y)
grid_search.best_estimator_

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [50]:
# Cross-validated score achieved by the best parameter combination.
best_grid_score = grid_search.best_score_
best_grid_score

0.3871538009583566

## Random Search Cross Validation
Burada C değerlerini 0 ile 4 arasındaki düzgün (uniform) dağılımdan örnekliyoruz.
`scipy.stats.uniform(loc, scale)` dağılımı `[loc, loc + scale]` aralığını kapsar.

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Randomized search for the regressor: C is sampled from a uniform
# distribution on [0, 4] (loc=0, scale=4) and the kernel is picked from a list.
# NOTE(review): the name `random` below shadows the standard-library module of
# the same name for the rest of the notebook.
distributions = {
    "C": uniform(loc=0, scale=4),
    "kernel": ["poly", "rbf", "linear"],
}
random = RandomizedSearchCV(svregressor, distributions, random_state=0)
search = random.fit(X, Y)
search.best_params_

{'C': 1.5337660753031108, 'kernel': 'linear'}

In [52]:
# Preview the first five rows of the frame before the classification section.
df.head(n=5)

Unnamed: 0,sex_Male,day_Fri,day_Sat,day_Sun,day_Thur,time_Lunch,smoker_Yes,tip,size,total_bill
0,0,0,0,1,0,0,0,0.001111,0.2,0.291579
1,1,0,0,1,0,0,0,0.073333,0.4,0.152283
2,1,0,0,1,0,0,0,0.277778,0.4,0.375786
3,1,0,0,1,0,0,0,0.256667,0.2,0.431713
4,0,0,0,1,0,0,0,0.29,0.6,0.450775


# Sınıflandırma

In [53]:
# Classification setup: the sex_Male indicator is the target and every other
# column (including the scaled tip) is a feature.
Y = df["sex_Male"]
X = df.drop(columns="sex_Male")

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows, stratified on the target so both classes keep
# their proportions in each split. random_state pins the shuffle so the split
# and the classification report below are reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, stratify=Y, random_state=42
)

In [55]:
from sklearn.svm import SVC

# Fit a support vector classifier with its default hyper-parameters on the
# training split (fit returns the estimator itself), then predict the held-out rows.
svc = SVC().fit(X_train, y_train)
predsvc = svc.predict(X_test)

In [56]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the SVC predictions on the held-out rows.
svc_report = classification_report(y_test, predsvc)
print(svc_report)

precision    recall  f1-score   support

           0       0.67      0.44      0.53         9
           1       0.74      0.88      0.80        16

    accuracy                           0.72        25
   macro avg       0.70      0.66      0.67        25
weighted avg       0.71      0.72      0.70        25



In [57]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the classifier on the full data set; the cell
# displays the average of the per-fold scores.
svc_cv_scores = cross_val_score(svc, X, Y, cv=10)
svc_cv_scores.mean()

0.6025

Stratified K Fold

In [58]:
from sklearn.model_selection import StratifiedKFold

# Same evaluation with an explicit 10-fold stratified splitter; print the
# per-fold scores followed by their mean.
skf = StratifiedKFold(n_splits=10)
scores = cross_val_score(svc, X, Y, cv=skf)
for template, value in (
    ("skorlar:\n{}", scores),
    ("skorların ortalaması:\n{}", scores.mean()),
):
    print(template.format(value))

skorlar:
[0.64       0.64       0.68       0.44       0.33333333 0.58333333
 0.75       0.66666667 0.625      0.66666667]
skorların ortalaması:
0.6025


In [59]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold grid search over SVC hyper-parameters, scored by accuracy.
# gamma is a kernel coefficient for the 'poly' and 'rbf' kernels only; the
# linear kernel ignores it. Crossing gamma with "linear" therefore refits the
# same model once per gamma value, so the linear kernel gets its own sub-grid
# without gamma. The poly/rbf sub-grid is listed first so the candidate order
# at the top of the search is unchanged.
c_values = [0.0001, 0.01, 0.1, 1, 10]
gamma_values = [0.0001, 0.01, 0.1, 1, 10]
param_grid = [
    {"C": c_values, "kernel": ["poly", "rbf"], "gamma": gamma_values},
    {"C": c_values, "kernel": ["linear"]},
]
grid_search_c = GridSearchCV(svc, param_grid, cv=5, scoring="accuracy")
grid_search_c.fit(X, Y)
grid_search_c.best_estimator_

SVC(C=0.0001, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Randomized search for the classifier: C is sampled from a uniform
# distribution on [0, 4] (loc=0, scale=4) and the kernel is picked from a list.
# This cell needs `svc`, `X` and `Y` from the classification cells above, so
# it must be run after them.
distributions = {
    "C": uniform(loc=0, scale=4),
    "kernel": ["poly", "rbf", "linear"],
}
random_c = RandomizedSearchCV(svc, distributions, random_state=0)
search_c = random_c.fit(X, Y)
search_c.best_params_

NameError: name 'X' is not defined

In [29]:
# Cross-validated score achieved by the best sampled parameter combination.
best_random_score_c = search_c.best_score_
best_random_score_c

NameError: name 'search_c' is not defined