In [1]:
#import the data
import pandas as pd
from pandas.core.frame import DataFrame
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

from datetime import datetime

from hyperopt import tpe, hp, fmin, STATUS_OK,Trials

original_data = pd.read_csv("历史锁券合约信息.csv")
scale = pd.read_csv("规模.csv", encoding='gbk')

# Dependent Variable

In [2]:
#construct a binary variable to denote the result if a customer claimed a securities successfully
#0 means not obtained, 1 means obtained
y_0 = original_data["订单状态"]
y_1 = [None] * len(y_0)
for i in range(len(y_0)):
    if y_0[i] == "5-已锁券":
        y_1[i]=1
    else:
        y_1[i]=0
y_df = DataFrame(y_1)
y_rename = y_df.rename(columns = {'0':'订单状态'})

In [3]:
y_rename

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0
...,...
31556,1
31557,1
31558,1
31559,1


# Independent Variables

In [4]:
#Filter by area, randomly denote a security with place1, place2 or place3
original_data['地域'] = np.random.randint(1, 4, original_data.shape[0])
zone_df = original_data['地域'].to_numpy()

In [5]:
#Filter by appointment time
application_time = pd.to_datetime(original_data['申请时间'])
application_time_str = application_time_num = [0] * original_data.shape[0]
for i in range(len(original_data)):
    application_time_str[i] = application_time[i].strftime('%Y%m%d%H%M%S')
    application_time_num[i] = int(application_time_str[i])
original_data['申请时间'] = application_time_str
data_grouped = original_data.groupby(["证券代码","有限期起始日"])
original_data['时间排序'] = data_grouped['申请时间'].rank(ascending=True)
time_df = original_data['时间排序'].to_frame()

In [6]:
#Filter by rate
original_data['费率排序'] = data_grouped['最高费率%'].rank(ascending=True)
rate_df = original_data['费率排序'].to_frame()

In [7]:
rate_df

Unnamed: 0,费率排序
0,1.0
1,1.5
2,1.5
3,1.0
4,1.0
...,...
31556,1.0
31557,1.0
31558,1.0
31559,1.0


In [8]:
#Filter by scale
data_combined = pd.merge(original_data, scale, how='left', left_on='股东名称', right_on='账户名称' )
size_fl = data_combined['汇总'].to_frame()
size_1d = np.ravel(size_fl)
size_1d[np.isnan(size_1d)] = 0
size_np = size_1d.astype(int)
size_df = pd.DataFrame(size_np)

In [9]:
#data with scale
data_full_list = [pd.DataFrame(zone_df), time_df, rate_df,size_df]
data_full_frame = pd.concat(data_full_list, axis=1, join='outer')
data_full = np.asarray(data_full_frame)

#data without scale
data_list = [pd.DataFrame(zone_df), time_df, rate_df]
data_frame = pd.concat(data_list, axis=1, join='outer')
data = np.asarray(data_frame)

y = np.asarray(y_rename)
y = np.ravel(y)

#split the data
x_full_train, x_full_test, y_full_train, y_full_test = train_test_split(data_full, y, test_size=0.20)
x_train, x_test, y_train, y_test = train_test_split(data, y, test_size=0.20)

# Regression Model

Baseline Regression Model

In [10]:
model_LR = LogisticRegression()
model_LR.fit(x_train, y_train)
predictions_LR = model_LR.predict(np.array((x_test)))
print("accuracy score:", accuracy_score(y_test, predictions_LR))
print("mean squared error:", mean_squared_error(y_test, predictions_LR))
adjusted_r_squared_LR = 1-(1-r2_score(y_test, predictions_LR))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_LR)

accuracy score: 0.7883731981625218
mean squared error: 0.21162680183747823
adjusted r squared: -0.26973342499232067


Lasso Regression Model

In [11]:
model_lasso = Lasso()
model_lasso.fit(x_train, y_train)
model_lasso.score(x_test, y_test)

-0.0001141518617053805

Ridge Regression Model

In [12]:
model_reg = Ridge()
model_reg.fit(x_train, y_train)
model_reg.score(x_test, y_test)

0.0022032875171419386

# Classification Model

KNN

In [13]:
model_KNN = KNeighborsClassifier()
model_KNN.fit(x_train, y_train)
predictions_KNN = model_KNN.predict(np.array((x_test)))
print("accuracy score:", model_KNN.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, predictions_KNN))
adjusted_r_squared_KNN = 1-(1-r2_score(y_test, predictions_KNN))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_KNN)

model_KNN = KNeighborsClassifier()
model_KNN.fit(x_full_train, y_full_train)
predictions_full_KNN = model_KNN.predict(np.array((x_full_test)))
print("accuracy score:", model_KNN.score(x_full_test, y_full_test))
print("mean squared error:", mean_squared_error(y_full_test, predictions_full_KNN))
adjusted_r_squared_KNN_full = 1-(1-r2_score(y_test, predictions_KNN))*((len(x_full_test)-1)/(len(x_full_test)-len(x_full_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_KNN_full)

accuracy score: 0.7855219388563283
mean squared error: 0.2144780611436718
adjusted r squared: -0.28684061185598986
accuracy score: 0.8172025978140345
mean squared error: 0.18279740218596546
adjusted r squared: -0.28704461322121744


In [14]:
#loop training linespace->data frame

Decision Tree

In [15]:
model_DT = DecisionTreeClassifier()
model_DT.fit(x_train, y_train)
predictions_DT = model_DT.predict(np.array((x_test)))
print("accuracy score:", model_DT.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, predictions_DT))
adjusted_r_squared_DT = 1-(1-r2_score(y_test, predictions_DT))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_DT)

model_DT = DecisionTreeClassifier()
model_DT.fit(x_full_train, y_full_train)
predictions_full_DT = model_DT.predict(np.array((x_full_test)))
print("accuracy score:", model_DT.score(x_full_test, y_full_test))
print("mean squared error:", mean_squared_error(y_full_test, predictions_full_DT))
adjusted_r_squared_DT_full = 1-(1-r2_score(y_test, predictions_DT))*((len(x_full_test)-1)/(len(x_full_test)-len(x_full_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_DT_full)

accuracy score: 0.7886900047520988
mean squared error: 0.21130999524790114
adjusted r squared: -0.2678326264519131
accuracy score: 0.8335181371772533
mean squared error: 0.1664818628227467
adjusted r squared: -0.2680336145030311


SVM

In [16]:
model_SVM = svm.SVC()
model_SVM.fit(x_train, y_train)
predictions_SVM = model_SVM.predict(np.array((x_test)))
print("accuracy score:", model_SVM.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, predictions_SVM))
adjusted_r_squared_SVM = 1-(1-r2_score(y_test, predictions_SVM))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_SVM)

model_SVM = svm.SVC()
model_SVM.fit(x_full_train, y_full_train)
predictions_full_SVM = model_SVM.predict(np.array((x_full_test)))
print("accuracy score:", model_SVM.score(x_full_test, y_full_test))
print("mean squared error:", mean_squared_error(y_full_test, predictions_full_SVM))
adjusted_r_squared_SVM_full = 1-(1-r2_score(y_test, predictions_SVM))*((len(x_full_test)-1)/(len(x_full_test)-len(x_full_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_SVM_full)

accuracy score: 0.7885316014573103
mean squared error: 0.2114683985426897
adjusted r squared: -0.2687830257221169
accuracy score: 0.8034215111674323
mean squared error: 0.1965784888325677
adjusted r squared: -0.26898416443894035


Naive Bayes

In [17]:
model_NB = GaussianNB()
model_NB.fit(x_train, y_train)
predictions_NB = model_NB.predict(np.array((x_test)))
print("accuracy score:", model_NB.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, predictions_NB))
adjusted_r_squared_NB = 1-(1-r2_score(y_test, predictions_NB))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_NB)

model_NB = GaussianNB()
model_NB.fit(x_full_train, y_full_train)
predictions_full_NB = model_NB.predict(np.array((x_full_test)))
print("accuracy score:", model_NB.score(x_full_test, y_full_test))
print("mean squared error:", mean_squared_error(y_full_test, predictions_full_NB))
adjusted_r_squared_NB_full = 1-(1-r2_score(y_test, predictions_NB))*((len(x_full_test)-1)/(len(x_full_test)-len(x_full_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_NB_full)

accuracy score: 0.7885316014573103
mean squared error: 0.2114683985426897
adjusted r squared: -0.2687830257221169
accuracy score: 0.8023126881039125
mean squared error: 0.19768731189608743
adjusted r squared: -0.26898416443894035


# Fine Tune with GridSearch

Logistic Regression

In [18]:
#define models and parameters
model_LR = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

#define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold()
grid_search_LR = GridSearchCV(estimator=model_LR, param_grid=grid, cv=cv)
grid_search_LR.fit(x_train, y_train)
params = grid_search_LR.best_params_ 


best_model_LR = LogisticRegression(C = params.get('C'), penalty = params.get('penalty'), solver = params.get('solver'))
best_model_LR.fit(x_train, y_train)
predictions_LR_best = best_model_LR.predict(np.array((x_test)))

#summarize the performance
accuracy_score(y_test, predictions_LR_best)
print("mean squared error:", mean_squared_error(y_test, predictions_LR_best))
adjusted_r_squared_LR = 1-(1-r2_score(y_test, predictions_LR_best))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_LR)

mean squared error: 0.21162680183747823
adjusted r squared: -0.26973342499232067


KNN

In [19]:
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
hyperparameters_KNN = dict(n_neighbors=n_neighbors, p=p)
#Create new KNN object
model_KNN_best = KNeighborsClassifier()
#Use GridSearch
clf_KNN = GridSearchCV(model_KNN_best, hyperparameters_KNN)
#Fit the model
best_model_KNN = clf_KNN.fit(x_train,y_train)
#Print The value of best Hyperparameters
print('Best p:', best_model_KNN.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model_KNN.best_estimator_.get_params()['n_neighbors'])

best_model_KNN = KNeighborsClassifier(n_neighbors=best_model_KNN.best_estimator_.get_params()['n_neighbors'], p=best_model_KNN.best_estimator_.get_params()['p'])
best_model_KNN.fit(x_train, y_train)
best_predictions_KNN = best_model_KNN.predict(np.array((x_test)))

print("accuracy score:", best_model_KNN.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, best_predictions_KNN))
adjusted_r_squared_KNN = 1-(1-r2_score(y_test, best_predictions_KNN))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_KNN)

Best p: 2
Best n_neighbors: 18
accuracy score: 0.7871059718042135
mean squared error: 0.21289402819578648
adjusted r squared: -0.27733661915395147


Decision Tree

In [21]:
hyperparameters_DT = {"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11],
           "min_samples_leaf":[1,3,5,7,9],
           "max_features":["auto","log2","sqrt",None]}
#Create new DT object
model_DT_best = DecisionTreeClassifier()
#Use GridSearch
clf_DT = GridSearchCV(model_DT_best,param_grid=hyperparameters_DT,scoring='neg_mean_squared_error',cv=3,verbose=3)
#Fit the model
best_model_DT = clf_DT.fit(x_train,y_train)
#Print The value of best Hyperparameters
print('Best splitter:', best_model_DT.best_estimator_.get_params()["splitter"])
print('Best max_depth:', best_model_DT.best_estimator_.get_params()["max_depth"])
print('Best min_samples_leaf:', best_model_DT.best_estimator_.get_params()["min_samples_leaf"])
print('Best max_feature:', best_model_DT.best_estimator_.get_params()["max_features"])

best_model_DT = DecisionTreeClassifier(splitter=best_model_DT.best_estimator_.get_params()['splitter'], 
                                      max_depth=best_model_DT.best_estimator_.get_params()['max_depth'],
                                      min_samples_leaf=best_model_DT.best_estimator_.get_params()['min_samples_leaf'],
                                      max_features=best_model_DT.best_estimator_.get_params()['max_features'])
best_model_DT.fit(x_train, y_train)
best_predictions_DT = best_model_DT.predict(np.array((x_test)))

print("accuracy score:", best_model_DT.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, best_predictions_DT))
adjusted_r_squared_DT = 1-(1-r2_score(y_test, best_predictions_DT))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_DT)

Fitting 3 folds for each of 240 candidates, totalling 720 fits
[CV 1/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=best;, score=nan total time=   0.0s
[CV 2/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=best;, score=nan total time=   0.0s
[CV 1/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 2/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 3/3] END max_depth=1, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 1/3] END max_depth=1, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 2/3] END max_depth=1, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=1, max_features=auto, min_sa

[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=random;, score=nan total time=   0.0s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=random;, score=nan total time=   0.0s
[CV 3/3] END max_depth=3, max_features=auto, min_samples_leaf=3, splitter=random;, score=nan total time=   0.0s
[CV 1/3] END max_depth=3, max_features=auto, min_samples_leaf=5, splitter=best;, score=nan total time=   0.0s
[CV 2/3] END max_depth=3, max_features=auto, min_samples_leaf=5, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=3, max_features=auto, min_samples_leaf=5, splitter=best;, score=nan total time=   0.0s
[CV 

[CV 2/3] END max_depth=5, max_features=auto, min_samples_leaf=1, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=5, max_features=auto, min_samples_leaf=1, splitter=best;, score=nan total time=   0.0s
[CV 1/3] END max_depth=5, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 2/3] END max_depth=5, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 3/3] END max_depth=5, max_features=auto, min_samples_leaf=1, splitter=random;, score=nan total time=   0.0s
[CV 1/3] END max_depth=5, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 2/3] END max_depth=5, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 3/3] END max_depth=5, max_features=auto, min_samples_leaf=3, splitter=best;, score=nan total time=   0.0s
[CV 1/3] END max_depth=5, max_features=auto, min_samples_leaf=3, splitter=random;, score=nan total time=   0.0s
[C

[CV 2/3] END max_depth=5, max_features=None, min_samples_leaf=3, splitter=best;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=5, max_features=None, min_samples_leaf=3, splitter=best;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=5, max_features=None, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=5, max_features=None, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=5, max_features=None, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=5, max_features=None, min_samples_leaf=5, splitter=best;, score=-0.206 total time=   0.0s
[CV 2/3] END max_depth=5, max_features=None, min_samples_leaf=5, splitter=best;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=5, max_features=None, min_samples_leaf=5, splitter=best;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=5, max_features=None, min_samples_leaf=5, splitter=random;, score=-

[CV 1/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=best;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=best;, score=-0.208 total time=   0.0s
[CV 3/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=best;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=random;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=random;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=7, max_features=sqrt, min_samples_leaf=7, splitter=random;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=7, max_features=sqrt, min_samples_leaf=9, splitter=best;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=7, max_features=sqrt, min_samples_leaf=9, splitter=best;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=7, max_features=sqrt, min_samples_leaf=9, splitter=best;, score=-0.

[CV 1/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=best;, score=-0.206 total time=   0.0s
[CV 2/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=best;, score=-0.208 total time=   0.0s
[CV 3/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=best;, score=-0.208 total time=   0.0s
[CV 1/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=random;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=random;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=9, max_features=log2, min_samples_leaf=5, splitter=random;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=9, max_features=log2, min_samples_leaf=7, splitter=best;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=9, max_features=log2, min_samples_leaf=7, splitter=best;, score=-0.208 total time=   0.0s
[CV 3/3] END max_depth=9, max_features=log2, min_samples_leaf=7, splitter=best;, score=-0.

[CV 1/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=best;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=best;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=best;, score=-0.209 total time=   0.0s
[CV 1/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 2/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=11, max_features=log2, min_samples_leaf=3, splitter=random;, score=-0.207 total time=   0.0s
[CV 1/3] END max_depth=11, max_features=log2, min_samples_leaf=5, splitter=best;, score=-0.206 total time=   0.0s
[CV 2/3] END max_depth=11, max_features=log2, min_samples_leaf=5, splitter=best;, score=-0.207 total time=   0.0s
[CV 3/3] END max_depth=11, max_features=log2, min_samples_leaf=5, splitter=best;, 

180 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\周宜纬\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\周宜纬\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\周宜纬\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\周宜纬\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py"

SVM

In [None]:
hyperparameters_SVM = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000],
                       'gamma': [0.001, 0.0001]}    
#Create new SVM object
model_SVM_best = svm.SVC()
#Use GridSearch
clf_SVM = GridSearchCV(model_SVM_best, hyperparameters_SVM)
#Fit the model
best_model_SVM = clf_SVM.fit(x_train, y_train)
#Print The value of best Hyperparameters
print('C:', best_model_DT.best_estimator.get_params()["C"])
print('gamma:', best_model_DT.best_estimator_.get_params()["gamma"])

best_model_SVM = svm.SVC(C=best_model_DT.best_estimator_.get_params()['C'], 
                         gamma=best_model_DT.best_estimator_.get_params()['gamma'])
best_model_SVM.fit(x_train, y_train)
best_predictions_SVM = model_SVM.predict(np.array((x_test)))

print("accuracy score:", best_model_SVM.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, best_predictions_SVM))
adjusted_r_squared_SVM = 1-(1-r2_score(y_test, best_predictions_SVM))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_SVM)

Naive Bayes

In [None]:
hyperparameters_NB = {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]}    
#Create new SVM object
model_NB_best = GaussianNB()
#Use GridSearch
clf_NB = GridSearchCV(model_NB_best, hyperparameters_NB)
#Fit the model
best_model_NB = clf_NB.fit(x_train, y_train)
#Print The value of best Hyperparameters
print('var_smoothing:', best_model_NB.best_estimator_.get_params()["var_smoothing"])

best_model_NB = GaussianNB(var_smoothing=best_model_NB.best_estimator_.get_params()['var_smoothing'])
best_model_NB.fit(x_train, y_train)
best_predictions_NB = best_model_NB.predict((x_test))

print("accuracy score:", best_model_NB.score(x_test, y_test))
print("mean squared error:", mean_squared_error(y_test, best_predictions_NB))
adjusted_r_squared_NB = 1-(1-r2_score(y_test, best_predictions_NB))*((len(x_test)-1)/(len(x_test)-len(x_test[0])-1))
print("adjusted r squared:", adjusted_r_squared_NB)