In [83]:
import datetime
from sklearn.base import clone
from sklearn.model_selection import KFold,train_test_split
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score
from colorama import Fore, Style
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, SplineTransformer, OneHotEncoder
from sklearn.linear_model import Ridge, LinearRegression
import matplotlib.pyplot as plt
import copy

In [56]:
train = pd.read_csv('dataset/train.csv', index_col='id', dtype='str')
test = pd.read_csv('dataset/test.csv', index_col='id', dtype='str')
#train = pd.read_csv('/kaggle/input/playground-series-s4e5/train.csv', index_col='id')
#test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv', index_col='id')

initial_features = list(train.columns)[:-1]

y = train["FloodProbability"].copy().astype(float)
train = train.drop(columns=["FloodProbability"])

# 将训练集和测试集合并
combined_df = pd.concat([train, test], keys=['train', 'test'])

# 对合并后的 DataFrame 进行独热编码
combined_df = pd.get_dummies(combined_df,dtype=bool)

# 分离训练集和测试集
train = combined_df.xs('train')
test = combined_df.xs('test')
# for df in [train, test]:
#     df['fsum'] = df.sum(axis=1) # for tree models

X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2, shuffle=True, random_state=42)

In [57]:
X_train

Unnamed: 0_level_0,MonsoonIntensity_0,MonsoonIntensity_1,MonsoonIntensity_10,MonsoonIntensity_11,MonsoonIntensity_12,MonsoonIntensity_13,MonsoonIntensity_14,MonsoonIntensity_15,MonsoonIntensity_16,MonsoonIntensity_2,...,PoliticalFactors_15,PoliticalFactors_16,PoliticalFactors_2,PoliticalFactors_3,PoliticalFactors_4,PoliticalFactors_5,PoliticalFactors_6,PoliticalFactors_7,PoliticalFactors_8,PoliticalFactors_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
805665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
863045,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
496045,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1079364,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
191800,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
259178,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
131932,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
671155,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [58]:
def get_summ_info(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
#     summ['first value'] = df.loc[0].values
#     summ['second value'] = df.loc[1].values
#     summ['third value'] = df.loc[2].values
    return summ.style.background_gradient(cmap='Blues')

In [59]:
import torch
from torch.optim import Adam,SGD
from torch.utils.data import DataLoader

In [72]:
class Model(torch.nn.Module):
    def __init__(self,in_=357):
        super().__init__()
        self.bn = torch.nn.BatchNorm1d(in_)
        self.dense1 = torch.nn.Linear(in_,64,)
        self.relu = torch.nn.ReLU()
        self.dense2 = torch.nn.Linear(64,32)
        self.relu2 = torch.nn.ReLU()
        self.output = torch.nn.Linear(32,1)
        self.softmax = torch.nn.Softmax()
    def forward(self,x):
        x = self.bn(x)
        x = self.relu(self.dense1(x))
        x = self.relu2(self.dense2(x))
        x = self.output(x)
        return x

In [73]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    def __init__(self, X, y):
        # 在这里将pandas的数据转换为张量
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)
    
    def __getitem__(self, index):
        # 运行数据加载器时，会返回下面的值
        X = self.X[index]
        y = self.y[index]
        return X, y
    
    def __len__(self):
        # 返回数据集的大小
        return len(self.y)

In [74]:
def foo_r2loss(y_pred, y_true):
    #y_true = torch.tensor(y_true, dtype=torch.float64)
    #y_pred = torch.tensor(y_pred, dtype=torch.float64)
    
    numerator = torch.sum((y_true - y_pred) ** 2, dim=0, dtype=torch.float64)
    mean_y_true = torch.mean(y_true, dim=0)
    denominator = torch.sum((y_true - mean_y_true) ** 2, dim=0, dtype=torch.float64)
    
    output_scores =  (numerator / denominator)
    return output_scores

In [75]:
y_train

id
805665     0.575
863045     0.400
496045     0.505
1079364    0.455
191800     0.535
           ...  
110268     0.450
259178     0.545
131932     0.420
671155     0.485
121958     0.505
Name: FloodProbability, Length: 894365, dtype: float64

In [76]:
torch.random.manual_seed(420)
net = Model()
criterion = torch.nn.MSELoss()
dataset = CustomDataset(X_train, y_train)
batch_data = DataLoader(dataset=dataset, batch_size=4096, shuffle=True, drop_last=False)

In [80]:
opt = Adam(net.parameters(), lr=0.0001)
for epoch in range(3):
    for n_,batch in enumerate(batch_data): 
        X = batch[0].view(batch[0].shape[0],-1)
        y = batch[1].view(batch[1].shape[0],1)
        sigma = net.forward(X)
        loss = criterion(sigma, y)
        if n_%20 == 0:
            print('r2_score',r2_score(y, sigma.detach().numpy()))
        loss.backward()
        opt.step()
        net.zero_grad()

r2_score 0.8337230517688591
r2_score 0.8401342415957478
r2_score 0.8317076685030351
r2_score 0.8440985166633351
r2_score 0.8521448583019978
r2_score 0.8497994826343065
r2_score 0.8410965471967764
r2_score 0.843531976920128
r2_score 0.846363525463998
r2_score 0.839063156657485
r2_score 0.8415433257407855
r2_score 0.8434213670815558
r2_score 0.8453984175075019
r2_score 0.8320193613423756
r2_score 0.8447067419903779
r2_score 0.8368709218487665
r2_score 0.8419219963177205
r2_score 0.8499510798833713
r2_score 0.8454952309525402
r2_score 0.8452900990843027
r2_score 0.8449250367458611
r2_score 0.8476970154228414
r2_score 0.8564415526005635
r2_score 0.851388605559616
r2_score 0.8499590279499717
r2_score 0.8428017035171601
r2_score 0.8390231796154439
r2_score 0.8470376187158518
r2_score 0.8463404014559857
r2_score 0.8405152194254434
r2_score 0.8436475550612907
r2_score 0.856390231882363
r2_score 0.8425593385461735


In [81]:
X = torch.Tensor(X_test.values).view(X_test.shape[0],-1)
y = torch.Tensor(y_test).view(y_test.shape[0],1)
sigma = net.forward(X).detach().numpy()
print('r2_score',r2_score(y, sigma))

r2_score 0.8278730362896654


In [67]:
torch.random.manual_seed(420)
net = Model()
criterion = torch.nn.MSELoss()
dataset = CustomDataset(train[columns],  train["FloodProbability"])
batch_data = DataLoader(dataset=dataset, batch_size=4096, shuffle=True, drop_last=False)

NameError: name 'columns' is not defined

In [87]:
opt = Adam(net.parameters(), lr=0.001)
for n_ in range(23):
    for n_,batch in enumerate(batch_data): 
        X = batch[0].view(batch[0].shape[0],-1)
        y = batch[1].view(batch[1].shape[0],1)
        sigma = net.forward(X)
        loss = criterion(sigma, y)
        if n_%20 == 0:
            print('r2_score',r2_score(y, sigma.detach().numpy()))
        loss.backward()
        opt.step()
        net.zero_grad()

r2_score -0.3207198579219146
r2_score 0.12075035690080937
r2_score 0.4184254113747212
r2_score 0.5752650486949066
r2_score 0.652625406343373
r2_score 0.6932605558518801
r2_score 0.7159495413334735
r2_score 0.7391243958979741
r2_score 0.7553600612774417
r2_score 0.7654854082144198
r2_score 0.7525181873188986
r2_score 0.7573950148965083
r2_score 0.7787068497525405
r2_score 0.7705463911685041
r2_score 0.7866619211834056
r2_score 0.7934847298435523
r2_score 0.7932357741797107
r2_score 0.7978773446547238
r2_score 0.7959349267230253
r2_score 0.8000249231568716
r2_score 0.7821352635555108
r2_score 0.8161662855492247
r2_score 0.8023671139893425
r2_score 0.8153144165044386
r2_score 0.8148925785581114
r2_score 0.7932292921709312
r2_score 0.8088222805482193
r2_score 0.8145052374608371
r2_score 0.8178091164818673
r2_score 0.8133255031669344
r2_score 0.8211194792846757
r2_score 0.8202961420180037
r2_score 0.8230318210374458
r2_score 0.8259731308258862
r2_score 0.8168200335999904
r2_score 0.81617838

In [33]:
X = torch.Tensor(X_test.values).view(X_test.shape[0],-1)
y = torch.Tensor(y_test).view(y_test.shape[0],1)
sigma = net.forward(X).detach().numpy()
print('r2_score',r2_score(y, sigma))

r2_score 0.6841867638586654


In [89]:
X = torch.tensor(test.values, dtype=torch.float32)
sigma = net.forward(X).detach().numpy()
sub = pd.Series(sigma.flatten(), index=test.index, name='FloodProbability')
filename = 'submission.csv'
sub.to_csv(filename)

In [90]:
test

Unnamed: 0_level_0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,sort_10,sort_11,sort_12,sort_13,sort_14,sort_15,sort_16,sort_17,sort_18,sort_19
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1117957,4,6,3,5,6,7,8,7,8,4,...,6,6,6,6,7,7,7,8,8,8
1117958,4,4,2,9,5,5,4,7,5,4,...,4,4,5,5,5,5,7,7,7,9
1117959,1,3,6,5,7,2,4,6,4,2,...,5,5,5,6,6,6,7,7,8,9
1117960,2,4,4,6,4,5,4,3,4,4,...,4,4,4,5,6,6,6,7,7,8
1117961,6,3,2,4,6,4,5,5,3,7,...,5,5,5,5,6,6,6,6,7,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1863257,5,4,8,3,5,4,4,5,5,5,...,5,5,5,5,5,6,6,6,6,8
1863258,4,4,2,12,4,3,4,3,5,5,...,4,4,4,5,5,5,5,5,7,12
1863259,5,7,9,5,5,6,7,5,5,3,...,5,5,6,6,7,7,9,9,11,11
1863260,4,7,6,3,5,2,3,8,6,7,...,6,6,6,6,7,7,7,8,8,8
