In [57]:
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [58]:
# Read the location revenue CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../3-LinearRegression/data/location_rev.csv')
df.head(n=5)

Unnamed: 0,revenue,num_competitors,median_income,num_loyalty_members,population_density,location_age
0,42247.8,3.0,30527.57,1407.0,3302.0,12.0
1,38628.37,3.0,30185.49,1025.0,4422.0,11.0
2,39715.16,1.0,32182.24,1498.0,3260.0,12.0
3,35593.3,5.0,29728.65,2340.0,4325.0,10.0
4,35128.18,4.0,30691.17,847.0,3774.0,11.0


In [59]:
# Predictor columns used by every model in this notebook.
X_col = ['num_competitors', 'median_income', 'num_loyalty_members', 'population_density', 'location_age']
X = df[X_col]

# Standardize the features. The fitted scaler is kept under its own name so the
# same transform can later be applied to new feature rows.
# NOTE(review): both scalers are fitted on the full dataset before the train/test
# split, so test-set statistics leak into the scaling - consider fitting on the
# training split only (e.g. inside a Pipeline).
x_scaler = StandardScaler()
X_scale = x_scaler.fit_transform(X)

# Double brackets keep the target as a single-column frame, which is the 2-D
# input StandardScaler requires.
y = df[["revenue"]]

# The target gets a separate scaler; y_scaler.inverse_transform maps scaled
# predictions back to revenue units.
y_scaler = StandardScaler()
y_scale = y_scaler.fit_transform(y)

# This cell used to reuse the name `scaler` for both objects, leaving it bound to
# the target scaler. Keep that binding for any cell that still uses the old name.
scaler = y_scaler

In [60]:
# Hold out 20% of the scaled data for testing. A fixed random_state makes the
# split, and therefore every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_scale, y_scale, test_size=0.2, random_state=42)

In [72]:
# Candidate regressors and, in the same order, the hyper-parameter grid searched
# for each one. The tree-based models get a fixed random_state so that repeated
# runs produce the same scores.
models = [
    SVR(),
    LinearRegression(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
]
grid_params = [
    # SVR
    {
        'C': [0.1, 1, 10, 100, 1000],
        'epsilon': [0.1, 1],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    # Linear Regression
    {
        'fit_intercept': [True, False],
        'positive': [True, False],
        'copy_X': [True, False]
    },
    # KNN
    {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'metric': ['minkowski', 'euclidean', 'manhattan']
    },
    # Decision Tree
    {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 5, 10],
        # None (use all features) replaces 'auto', which meant the same thing for
        # regressors but is rejected by recent scikit-learn releases.
        'max_features': [None, 'sqrt', 'log2']
    },
    # Random Forest
    {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        # See the Decision Tree grid: None replaces the removed 'auto' option.
        'max_features': [None, 'sqrt', 'log2']
    }
]

# Per-model results, appended in the order of `models`.
scores = []
train_times = []
names = []
best_params_ = []

# Estimators such as SVR expect a 1-D target; flatten the single-column target once.
y_train_1d = np.ravel(y_train)

for model, param_grid in zip(models, grid_params):
    # Exhaustive 10-fold search scored by R^2; n_jobs=-1 spreads the fits over all cores.
    grid_search = GridSearchCV(model, param_grid, cv=10, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train_1d)
    best_params = grid_search.best_params_

    best_params_.append(best_params)

    # best_estimator_ carries the winning grid values together with the
    # constructor settings (e.g. random_state) that rebuilding the model from
    # best_params alone would drop. cross_val_score clones it for every fold.
    best_model = grid_search.best_estimator_

    # Time only the 10-fold cross-validation of the tuned configuration.
    start = time.perf_counter()
    score = cross_val_score(best_model, X_train, y_train_1d, scoring="r2", cv=10)
    end = time.perf_counter()

    scores.append(score.mean())
    train_times.append(end - start)
    names.append(model.__class__.__name__)

# Summary table. It gets its own name so that `df`, the loaded dataset, is not overwritten.
model_results = pd.DataFrame({'Model': names, 'Score': scores, 'Time': train_times})
model_results.head(10)

Unnamed: 0,Model,Score,Time
0,SVR,0.808234,0.211649
1,LinearRegression,0.810059,0.014087
2,KNeighborsRegressor,0.725609,0.0273
3,DecisionTreeRegressor,0.533878,0.021315
4,RandomForestRegressor,0.74767,6.847184


In [62]:
# Search space for the LinearRegression settings.
param_grid = dict(
    fit_intercept=[True, False],
    positive=[True, False],
    copy_X=[True, False],
)

# Estimator whose settings are being tuned.
model = LinearRegression()

# Exhaustive search with 10-fold cross-validation, scored by R^2, run on the training split.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=10)
grid_search.fit(X_train, y_train)

# Report the combination with the best mean cross-validated score.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'copy_X': True, 'fit_intercept': False, 'positive': False}


In [None]:
# Build LinearRegression with the tuned settings and fit it on the whole training split.
best_model = LinearRegression(**best_params).fit(X_train, y_train)

# Predict the held-out rows and report their R-squared score.
y_pred = best_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared score:", r2)

### Tạo Pipeline

In [75]:
# Chain standardization and a linear-kernel SVR so the scaler is fitted on the
# training rows only each time the pipeline is fitted.
model_pipeline = Pipeline(steps=[('Scale_data', StandardScaler()),
                                 ('SVM model', SVR(kernel='linear', C=1000))])

# Split the raw (unscaled) features and target; the pipeline does the scaling.
# A fixed random_state keeps the split and the printed scores reproducible.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which earlier cells
# created from the pre-scaled arrays - re-run the earlier split cell before
# re-running those cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# SVR expects a 1-D target, so flatten the single-column target before fitting.
model_pipeline.fit(X_train, np.ravel(y_train))

y_pred = model_pipeline.predict(X_test)

# R^2 on the training split, then on the held-out split.
print(model_pipeline.score(X_train, y_train))
print(model_pipeline.score(X_test, y_test))

0.8128204603566682
0.8089040226182714
