In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
# import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, RepeatedKFold
from scipy import sparse

# Show every column when a DataFrame is displayed (no column limit)
pd.set_option('display.max_columns', None)

# Show every row when a DataFrame is displayed (no row limit)
pd.set_option('display.max_rows', None)

from datetime import datetime

补充：
1. OneHotEncoder --> 独热编码
2. 原理：使用 n 位状态寄存器来对 n 个状态进行编码。每个状态有自己的独立寄存器位，并且在任意时候只有一位有效
3. 再细：每个类别取值对应一个独立的 0/1 列，而不是把整数转成二进制；如：某特征有 0、1、2 三种取值，那么编码分别为 [1,0,0]、[0,1,0]、[0,0,1]
4. 缺点：特征可能比较稀疏 ---> 如果类别数很多，例如某特征有 64 种取值，编码后就会变成 64 列，而每个样本在这 64 列中只有 1 个 1，其余全为 0。
        （不过缺点更在于不适于文本类编码）

In [None]:
# Load the data: all five CSV files are read with the same ISO-8859-1 encoding,
# in the same order as the names on the left-hand side.

train_abbr, train, test_abbr, test, test_sub = (
    pd.read_csv(csv_name, encoding='ISO-8859-1')
    for csv_name in (
        'happiness_train_abbr.csv',
        'happiness_train_complete.csv',
        'happiness_test_abbr.csv',
        'happiness_test_complete.csv',
        'happiness_submit.csv',
    )
)

In [None]:
# (rows, columns) of the complete test set
test.shape

In [None]:
# (rows, columns) of the submission template
test_sub.shape

In [None]:
# (rows, columns) of the complete training set
train.shape

In [None]:
# Preview the first rows of the training set
train.head()

 info函数可以打印一些有关数据表的相关信息
 参数：
    verbose：是否打印完整的表
    buf：输出到哪里 缓冲区or stdout
    max_cols / memory_usage：最多显示的列数 / 是否显示内存占用
    null_counts：是否显示非空计数（pandas 1.2 起改名为 show_counts，2.0 起 null_counts 已被移除）

In [None]:
# Print dtype and non-null count for every column.
# `null_counts` was renamed to `show_counts` in pandas 1.2 and removed in pandas 2.0.
train.info(verbose=True, show_counts=True)

In [None]:
# Target column: the raw `happiness` answers from the training set
y_train_ = train["happiness"]
# Show how many rows fall on each answer value
y_train_.value_counts()

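map函数，这里的用法： series.map(fun)（pandas 的 Series.map；注意它不同于 Python 内置的 map(fun, iterable)）
作用：映射，将fun逐个作用于某一列的每个元素，返回一个新的 Series

lambda：用来定义匿名函数，例如 lambda x: x-1 表示“接收 x，返回 x-1”的函数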

In [None]:
# Recode the -8 answer code as 3; every other value is kept as is.
# NOTE(review): -8 looks like a "no valid answer" code and 3 like the scale midpoint - confirm against the codebook.
y_train_ = y_train_.replace(-8, 3)

In [None]:
# Shift every label down by one (the prediction cell adds the 1 back before writing the submission).
# Re-running this cell shifts the labels again, so run it exactly once per kernel session.
y_train_ = y_train_ - 1

In [None]:
# concat joins two datasets: here test is stacked below train (axis=0) and the row index is renumbered

data = pd.concat([train,test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the combined train + test frame
data.shape

In [None]:
# Data processing starts here

# Handle the survey timestamp: parse it, then derive one calendar feature per attribute.

data['survey_time'] = pd.to_datetime(data['survey_time'], format='%Y-%m-%d %H:%M:%S')
survey_dt = data["survey_time"].dt
for time_part in ("weekday", "year", "quarter", "hour", "month"):
    data[time_part] = getattr(survey_dt, time_part)

In [None]:
# Split the day into time-of-day segments
def hour_cut(x):
    """Map an hour of the day to a segment id.

    Segments are [0, 6) -> 0, [6, 8) -> 1, [8, 12) -> 2, [12, 14) -> 3,
    [14, 18) -> 4, [18, 21) -> 5 and [21, 24) -> 6.

    Returns None for anything outside [0, 24), including NaN.
    """
    if not 0 <= x < 24:
        return None
    segment_ends = (6, 8, 12, 14, 18, 21, 24)
    for segment, end_hour in enumerate(segment_ends):
        if x < end_hour:
            return segment

    
# Bucket every survey hour with hour_cut and store the segment id as a new column
data["hour_cut"]=data["hour"].map(hour_cut)

In [None]:
# Age at survey time: survey year minus the year stored in `birth`
data["survey_age"]=data["year"]-data["birth"]

In [None]:
# Shift the happiness column down by one.
# NOTE(review): `happiness` is dropped from `data` in the next cell, so this shift never reaches the features.
data["happiness"] = data["happiness"] - 1

In [None]:
# Remove the columns that must not be used as features, in one call.
data = data.drop(columns=["edu_other", "happiness", "survey_time"])

In [None]:
# Turn join_party into a presence flag: 1 where a value is recorded, 0 where it is missing.
data["join_party"] = data["join_party"].notna().astype("int64")

In [None]:
def birth_split(x):
    """Map a birth year to a ten-year cohort id.

    Cohorts are [1920, 1930] -> 0, (1930, 1940] -> 1, ... , (1990, 2000] -> 7.

    Returns None for years outside [1920, 2000], including NaN.
    """
    if not 1920 <= x <= 2000:
        return None
    cohort_last_years = (1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000)
    for cohort, last_year in enumerate(cohort_last_years):
        if x <= last_year:
            return cohort
    
# Bucket every birth year with birth_split and store the cohort id as a new column
data["birth_s"]=data["birth"].map(birth_split)

In [None]:
def income_cut(x):
    """Map an income value to an income-band id.

    Bands:
        x < 0              -> 0  (negative values)
        0 <= x < 1200      -> 1
        1200 <= x <= 10000 -> 2
        10000 < x < 24000  -> 3
        24000 <= x < 40000 -> 4
        x >= 40000         -> 5

    The previous bounds left exactly 1200 and exactly 24000 outside every
    band, so those incomes came back as None. Both now fall into the band
    that starts at that value, matching how 0 and 40000 are already handled.

    Returns None only when x compares false against every bound (NaN).
    """
    if x < 0:
        return 0
    if x < 1200:
        return 1
    if x <= 10000:
        return 2
    if x < 24000:
        return 3
    if x < 40000:
        return 4
    if x >= 40000:
        return 5
    # Only NaN reaches this point: every comparison above is False for it.
    return None
 

    
# Bucket every income value with income_cut and store the band id as a new column
data["income_cut"]=data["income"].map(income_cut)

In [None]:
# Per-column fill values for missing answers.
fill_values = {
    "edu_status": 5,
    "edu_yr": -2,
    "hukou_loc": 1,
    "social_neighbor": 8,
    "social_friend": 8,
    "work_status": 0,
    "work_yr": 0,
    "work_type": 0,
    "work_manage": 0,
    "family_income": -2,
}
for column_name, fill_value in fill_values.items():
    data[column_name] = data[column_name].fillna(fill_value)

# The "*_other" columns are reduced to presence flags: 0 where missing, 1 otherwise.
for column_name in ("property_other", "invest_other"):
    data[column_name] = data[column_name].map(lambda x:0 if pd.isnull(x)  else 1)

In [None]:
# Children, marriage and spouse ("s_*") columns: a missing answer is filled with 0.
zero_filled_columns = (
    "minor_child",
    "marital_1st",
    "s_birth",
    "marital_now",
    "s_edu",
    "s_political",
    "s_hukou",
    "s_income",
    "s_work_exper",
    "s_work_status",
    "s_work_type",
)
for column_name in zero_filled_columns:
    data[column_name] = data[column_name].fillna(0)

In [None]:
# Drop the `id` column so it does not end up among the model features
data=data.drop(["id"], axis=1)

In [None]:
# `data` was built as train stacked on top of test, so the first len(train) rows
# are the training rows and everything after them belongs to the test set.
n_train_rows = train.shape[0]
X_train_ = data.iloc[:n_train_rows]
X_test_ = data.iloc[n_train_rows:]

In [None]:
# Name of the label column and the ordered list of feature column names.
target_column = 'happiness'
feature_columns = X_test_.columns.tolist()
feature_columns

In [None]:
# Convert the features and the labels to plain NumPy arrays for the model.
X_train, y_train, X_test = (
    np.array(frame) for frame in (X_train_, y_train_, X_test_)
)

In [None]:
# from tpot import TPOTRegressor

# tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2)
# tpot.fit(X_train, y_train)
# print(tpot.score(X_train, y_train))
# tpot.export('tpot_boston_pipeline.py')

In [None]:
# XGBoost regression
# X_train / y_train / X_test are built in an earlier cell and xgboost is imported in
# the first cell, so neither is repeated here.
# `num_boost_round` belongs to xgb.train, not to the scikit-learn wrapper, whose tree
# count is `n_estimators`; the stray argument is therefore dropped.
model = xgb.XGBRegressor(max_depth=3, learning_rate=0.1,
                         min_child_weight=2, n_estimators=100, n_jobs=1,
                         subsample=0.65, verbosity=0)
model.fit(X_train, y_train)
Y_pre = model.predict(X_test)

# A regressor can overshoot in either direction. The labels were shifted down by 1
# before training, so clamp to the label range seen in training (the old cap of 5
# was one too high and had no lower bound) and only then undo the shift.
y_new_prd = np.clip(Y_pre, y_train.min(), y_train.max())

result = list(y_new_prd + 1)
test_sub["happiness"] = result
# index=False keeps the DataFrame row index out of the submission file.
test_sub.to_csv("submit.csv", index=False)

In [None]:
# Reload the submission file as a sanity check.
df = pd.read_csv('submit.csv')
# A CSV written together with its row index comes back with that column named
# "Unnamed: 0". The old `drop(columns=0, axis=1)` looked for a column labelled 0 and
# discarded its result; drop by name instead, and tolerate the column being absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head()