In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

import pandas as pd

In [5]:
train_data_file = 'data/zhengqi_train.txt'
test_data_file = 'data/zhengqi_test.txt'

train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

train_data = train_data[train_data['V9']>-7.5]
test_data = test_data[test_data['V9']>-7.5]

from sklearn.decomposition import PCA
from sklearn import preprocessing

feature_columns = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[feature_columns])

train_data_scaler = min_max_scaler.transform(train_data[feature_columns])
test_data_scaler = min_max_scaler.transform(test_data[feature_columns])

train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = feature_columns
train_data_scaler['target'] = train_data['target']

test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = feature_columns

pca = PCA(n_components=16)
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:, 0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_test_pca_16 = pd.DataFrame(new_test_pca_16)

new_train_pca_16['target'] = train_data_scaler['target']
new_train_pca_16.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,target
count,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2886.0,2884.0
mean,-3.9392530000000004e-17,-1.9696260000000003e-17,2.9544400000000004e-17,-5.908879000000001e-17,7.386099e-17,-2.708236e-17,5.908879000000001e-17,5.047168e-17,-4.9240660000000004e-17,8.740217e-17,9.848132e-18,1.280257e-16,-1.538771e-17,-5.293371e-17,-9.848132e-18,1.335653e-16,0.127274
std,0.3998976,0.350024,0.2938631,0.2728023,0.2077128,0.1951842,0.1877104,0.160767,0.1512707,0.1443772,0.136879,0.1286191,0.1193301,0.1149757,0.1133507,0.1019258,0.983462
min,-1.071795,-0.9429479,-0.9948314,-0.7103086,-0.7703985,-0.5340284,-0.5993764,-0.5870792,-0.6283013,-0.4902477,-0.6340768,-0.5905543,-0.417572,-0.4309496,-0.4170885,-0.3602942,-3.044
25%,-0.2804085,-0.2613727,-0.2090798,-0.1945196,-0.1315623,-0.1264092,-0.1236355,-0.1016389,-0.09663332,-0.09297162,-0.08202524,-0.07722117,-0.07140315,-0.07490259,-0.07712204,-0.06605996,-0.3485
50%,-0.01417104,-0.01277241,0.02112166,-0.02337395,-0.005122861,-0.01355346,-0.0001746628,-0.004656764,0.002574516,-0.001475378,0.00728976,-0.005671208,-0.004157889,0.001042171,-0.001738348,-0.0007629364,0.313
75%,0.2287306,0.231772,0.2069571,0.165759,0.1281656,0.0999311,0.1272074,0.09657091,0.1002793,0.09059699,0.08833988,0.07148089,0.06778119,0.07575507,0.07118075,0.06354093,0.79425
max,1.59773,1.382802,1.01025,1.448007,1.034062,1.358963,0.61916,0.7370014,0.6447869,0.5839577,0.6405349,0.6779764,0.5157227,0.4978796,0.4673081,0.4570815,2.538


Split Data

In [6]:
new_train_pca_16 = new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']

train_data, test_data, train_target, test_target = train_test_split(train, target, test_size=0.2, random_state=0)

Linear Regression

In [7]:
clf = LinearRegression()
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("LinearRegression: ", score)

LinearRegression:  0.27170117426273427


K-Neighbors Regression

In [8]:
clf = KNeighborsRegressor(n_neighbors=8)
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("KNeighborsRegression: ", score)

KNeighborsRegression:  0.2730478217182093


Random Forest Regression

In [9]:
clf = RandomForestRegressor()
clf.fit(train_data, train_target)
score = mean_squared_error(test_target, clf.predict(test_data))
print("RandomForestRegressor: ", score)

RandomForestRegressor:  0.2550231710032872


LGB Regression

In [10]:
import lightgbm as lgb
clf = lgb.LGBMRegressor(learning_rate=0.01, max_depth=-1,
                        n_estimators=5000, boosting_type='gbdt',
                        random_state=2021, objective='regression')

clf.fit(X=train_data, y=train_target, eval_metric='MSE', verbose=50)

score = mean_squared_error(test_target, clf.predict(test_data))
print("LightGBM: ", score)

LightGBM:  0.24493711836175286
