In [1]:
import datetime
from sklearn.base import clone
from sklearn.model_selection import KFold,train_test_split
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import r2_score
from colorama import Fore, Style
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, SplineTransformer, OneHotEncoder
from sklearn.linear_model import Ridge, LinearRegression
import matplotlib.pyplot as plt
import torch
from torch.optim import Adam,SGD
from torch.utils.data import DataLoader

In [12]:
# Load the competition data; the `id` column becomes the index of both frames.
train = pd.read_csv('dataset/train.csv', index_col='id')
test = pd.read_csv('dataset/test.csv', index_col='id')

# Name of the regression target; every other column of `train` is a feature.
TARGET = 'FloodProbability'
# Exclude the target by name rather than by position so the feature list does
# not silently depend on the column order of the CSV.
initial_features = [col for col in train.columns if col != TARGET]

# Target column. (The original bound the feature matrix to `y`, and ran a
# `df = pd.get_dummies(df)` loop that only rebound the loop variable and never
# modified `train`/`test`; both have been corrected/removed.)
y = train[TARGET]

# One indicator column per distinct target value, so the network below can be
# trained as a classifier over the observed probability levels.
onehot_y = pd.get_dummies(train[TARGET])
X_train, X_test, y_train, y_test = train_test_split(
    train[initial_features], onehot_y, test_size=0.2, shuffle=True, random_state=42
)

In [13]:
def get_summ_info(df):
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = df.isnull().sum().values
    summ['%missing'] = df.isnull().sum().values / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = pd.DataFrame(df.describe(include='all').transpose())
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['first value'] = df.loc[0].values
    summ['second value'] = df.loc[1].values
    summ['third value'] = df.loc[2].values
    return summ.style.background_gradient(cmap='Blues')

In [14]:
# Show the per-column summary (dtype, missing counts, cardinality, min/max and
# the first three rows) for the training frame.
get_summ_info(train)

data shape: (1117957, 43)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,first value,second value,third value
MonsoonIntensity,int64,0,0.0,17,0.0,16.0,5,6,6
TopographyDrainage,int64,0,0.0,19,0.0,18.0,8,7,5
RiverManagement,int64,0,0.0,17,0.0,16.0,5,4,6
Deforestation,int64,0,0.0,18,0.0,17.0,8,4,7
Urbanization,int64,0,0.0,18,0.0,17.0,6,8,3
ClimateChange,int64,0,0.0,18,0.0,17.0,4,8,7
DamsQuality,int64,0,0.0,17,0.0,16.0,4,3,1
Siltation,int64,0,0.0,17,0.0,16.0,3,5,5
AgriculturalPractices,int64,0,0.0,17,0.0,16.0,3,4,4
Encroachments,int64,0,0.0,19,0.0,18.0,4,6,5


In [15]:
def _fit_predict_averaged(model, X_fit, y_fit, X_pred, n_repeats):
    """Fit ``n_repeats`` fresh clones of ``model`` and average their predictions on ``X_pred``."""
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be >= 1, got {n_repeats}")

    y_pred = np.zeros(len(X_pred), dtype=float)
    for i in range(n_repeats):
        m = clone(model)
        if n_repeats > 1:
            # Re-seed the final estimator so the repeats differ from each other.
            final = m[-1] if isinstance(m, Pipeline) else m
            # Estimators without a `random_state` parameter (e.g.
            # LinearRegression) are left untouched instead of making
            # set_params raise.
            if 'random_state' in final.get_params(deep=False):
                final.set_params(random_state=i)
        m.fit(X_fit, y_fit)
        y_pred += m.predict(X_pred)
    return y_pred / n_repeats


def cross_validate(model=None, label_name='', n_repeats=1, SINGLE_FOLD=False, COMPUTE_TEST_PRED=False):
    """Run 5-fold cross-validation of ``model`` on the global ``train`` frame.

    Reads the module-level ``train`` and ``test`` frames: the feature list is
    the column list of ``test`` and the target is ``train.FloodProbability``.
    Prints the R2 score of every fold and a coloured overall line with the mean
    score and the elapsed time in minutes.

    Parameters
    ----------
    model : estimator or Pipeline, optional
        Unfitted scikit-learn estimator; it is cloned before every fit.
        Defaults to ``StandardScaler`` followed by ``LinearRegression``.
    label_name : str
        Free text appended to the overall score line.
    n_repeats : int
        Number of differently seeded fits whose predictions are averaged.
    SINGLE_FOLD : bool
        If True, stop after the first fold.
    COMPUTE_TEST_PRED : bool
        If True, retrain on the whole training set after cross-validation and
        predict ``test``.

    Returns
    -------
    numpy.ndarray or None
        Averaged predictions for ``test`` when ``COMPUTE_TEST_PRED`` is True
        (they are also printed), otherwise None.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if model is None:
        model = make_pipeline(StandardScaler(), LinearRegression())
    features = list(test.columns)
    X_all = train[features]
    y_all = train.FloodProbability

    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    start_time = datetime.datetime.now()
    scores = []
    for fold, (idx_tr, idx_va) in enumerate(kf.split(train)):
        y_va = y_all.iloc[idx_va]
        y_pred = _fit_predict_averaged(
            model, X_all.iloc[idx_tr], y_all.iloc[idx_tr], X_all.iloc[idx_va], n_repeats
        )

        score = r2_score(y_va, y_pred)
        print(f"# Fold {fold}: R2={score:.5f}")
        scores.append(score)
        if SINGLE_FOLD:
            break

    elapsed_time = datetime.datetime.now() - start_time
    print(f"{Fore.GREEN}# Overall: {np.array(scores).mean():.5f} {label_name}"
          f"{' single fold' if SINGLE_FOLD else ''}"
          f"   {int(np.round(elapsed_time.total_seconds() / 60))} min{Style.RESET_ALL}")

    if COMPUTE_TEST_PRED:
        # Retrain n_repeats times with the whole dataset and average
        y_pred = _fit_predict_averaged(model, X_all, y_all, test[features], n_repeats)
        print("y_pred:", y_pred)
        # Hand the predictions back so the caller can actually use them.
        return y_pred
    return None

In [16]:
# Baseline: with no arguments this cross-validates the default
# StandardScaler + LinearRegression pipeline over the 5 folds.
cross_validate()

# Fold 0: R2=0.85231
# Fold 1: R2=0.85167
# Fold 2: R2=0.85102
# Fold 3: R2=0.85114
# Fold 4: R2=0.85111
[32m# Overall: 0.85145    0 min[0m


In [135]:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Map-style dataset that pairs a pandas feature table with its targets.

    Both inputs are converted to float32 tensors once, at construction time,
    so indexing afterwards is plain tensor indexing.
    """

    def __init__(self, X, y):
        # Convert the pandas data to tensors here, up front.
        features, targets = X.values, y.values
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        # Size of the dataset = number of target rows.
        return self.y.shape[0]

    def __getitem__(self, index):
        # This (features, target) pair is what the data loader receives.
        return self.X[index], self.y[index]

In [156]:
# Train the network on the one-hot encoded target for three epochs, printing
# the R2 of every mini-batch (measured before that batch's weight update).
torch.random.manual_seed(420)  # fixed seed for the torch RNG
# NOTE(review): `Model` is not defined in any visible cell -- confirm it is
# defined before this cell runs on a fresh kernel.
net = Model()
criterion = torch.nn.CrossEntropyLoss()
opt = Adam(net.parameters(), lr=0.005)

dataset = CustomDataset(X_train, y_train)
batch_data = DataLoader(dataset=dataset, batch_size=4096, shuffle=True, drop_last=False)

onehot_y_columns = onehot_y.columns
# Lookup table: class index -> target value that the one-hot column stands for.
class_values = onehot_y_columns.to_numpy()

for n_ in range(3):
    for X, y in batch_data:
        opt.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        sigma = net(X)
        loss = criterion(sigma, y)

        # Decode predicted and true class indices back to target values with a
        # tensor argmax instead of a per-batch pandas round trip.
        max_labels = class_values[sigma.detach().argmax(dim=1).numpy()]
        max_y = class_values[y.argmax(dim=1).numpy()]

        print('r2_score', r2_score(max_y, max_labels))
        loss.backward()
        opt.step()

r2_score -5.345542972177307
r2_score -0.26418401750190856
r2_score -0.375037691844915
r2_score -0.04468736677701535
r2_score 0.07188056503837337
r2_score 0.1213167319167846
r2_score 0.33517595789718035
r2_score 0.35340463276706413
r2_score 0.39338617369159656
r2_score 0.534285735674547
r2_score 0.5938937547534734
r2_score 0.5617089267539286
r2_score 0.5973483402610099
r2_score 0.6443814918716481
r2_score 0.6698620725156612
r2_score 0.6682829671516544
r2_score 0.6590600765946649
r2_score 0.6510717763017515
r2_score 0.6864344289221063
r2_score 0.6913936715112625
r2_score 0.6946028534336235
r2_score 0.7117798549536146
r2_score 0.7472609867233082
r2_score 0.7569315182816604
r2_score 0.7560575880234378
r2_score 0.6994618426475773
r2_score 0.6971481285032537
r2_score 0.7494961060430938
r2_score 0.7585778371243769
r2_score 0.760020215457997
r2_score 0.7582846365340433
r2_score 0.7825901795360797
r2_score 0.7856942481537559
r2_score 0.7620958692322044
r2_score 0.674946521899415
r2_score 0.7116

In [159]:
# Score the trained network on the hold-out split.
X = torch.tensor(X_test.values, dtype=torch.float32)
y = torch.tensor(y_test.values, dtype=torch.float32)

# NOTE(review): `Model` is not visible here; if it contains dropout or
# batch-norm layers, call net.eval() before scoring -- confirm.
# Inference only: skip building an autograd graph over the whole hold-out set.
with torch.no_grad():
    sigma = net(X)

# Decode class indices back to the target values the one-hot columns stand for.
class_values = onehot_y_columns.to_numpy()
max_labels = class_values[sigma.argmax(dim=1).numpy()]
max_y = class_values[y.argmax(dim=1).numpy()]
print('r2_score', r2_score(max_y, max_labels))

r2_score 0.7654835185186533


In [161]:
# NOTE(review): this cell repeats the earlier training cell verbatim
# (same seed, model, optimizer and loop); consider keeping only one of them.
# Train the network on the one-hot encoded target for three epochs, printing
# the R2 of every mini-batch (measured before that batch's weight update).
torch.random.manual_seed(420)  # fixed seed for the torch RNG
# NOTE(review): `Model` is not defined in any visible cell -- confirm it is
# defined before this cell runs on a fresh kernel.
net = Model()
criterion = torch.nn.CrossEntropyLoss()
opt = Adam(net.parameters(), lr=0.005)

dataset = CustomDataset(X_train, y_train)
batch_data = DataLoader(dataset=dataset, batch_size=4096, shuffle=True, drop_last=False)

onehot_y_columns = onehot_y.columns
# Lookup table: class index -> target value that the one-hot column stands for.
class_values = onehot_y_columns.to_numpy()

for n_ in range(3):
    for X, y in batch_data:
        opt.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        sigma = net(X)
        loss = criterion(sigma, y)

        # Decode predicted and true class indices back to target values with a
        # tensor argmax instead of a per-batch pandas round trip.
        max_labels = class_values[sigma.detach().argmax(dim=1).numpy()]
        max_y = class_values[y.argmax(dim=1).numpy()]

        print('r2_score', r2_score(max_y, max_labels))
        loss.backward()
        opt.step()

r2_score -5.345542972177307
r2_score -0.26418401750190856
r2_score -0.375037691844915
r2_score -0.04468736677701535
r2_score 0.07188056503837337
r2_score 0.1213167319167846
r2_score 0.33517595789718035
r2_score 0.35340463276706413
r2_score 0.39338617369159656
r2_score 0.534285735674547
r2_score 0.5938937547534734
r2_score 0.5617089267539286
r2_score 0.5973483402610099
r2_score 0.6443814918716481
r2_score 0.6698620725156612
r2_score 0.6682829671516544
r2_score 0.6590600765946649
r2_score 0.6510717763017515
r2_score 0.6864344289221063
r2_score 0.6913936715112625
r2_score 0.6946028534336235
r2_score 0.7117798549536146
r2_score 0.7472609867233082
r2_score 0.7569315182816604
r2_score 0.7560575880234378
r2_score 0.6994618426475773
r2_score 0.6971481285032537
r2_score 0.7494961060430938
r2_score 0.7585778371243769
r2_score 0.760020215457997
r2_score 0.7582846365340433
r2_score 0.7825901795360797
r2_score 0.7856942481537559
r2_score 0.7620958692322044
r2_score 0.674946521899415
r2_score 0.7116

In [164]:
# NOTE(review): this cell repeats the earlier hold-out evaluation cell
# verbatim; consider keeping only one of them.
# Score the trained network on the hold-out split.
X = torch.tensor(X_test.values, dtype=torch.float32)
y = torch.tensor(y_test.values, dtype=torch.float32)

# NOTE(review): `Model` is not visible here; if it contains dropout or
# batch-norm layers, call net.eval() before scoring -- confirm.
# Inference only: skip building an autograd graph over the whole hold-out set.
with torch.no_grad():
    sigma = net(X)

# Decode class indices back to the target values the one-hot columns stand for.
class_values = onehot_y_columns.to_numpy()
max_labels = class_values[sigma.argmax(dim=1).numpy()]
max_y = class_values[y.argmax(dim=1).numpy()]
print('r2_score', r2_score(max_y, max_labels))

r2_score 0.7654629998224096


In [167]:
# Predict the test set and write the submission file.
# `test.values` is already 2-D (rows x features), so no reshape is needed.
X = torch.tensor(test.values, dtype=torch.float32)

# Inference only: no autograd graph is needed for the full test set.
with torch.no_grad():
    sigma = net(X).numpy()

# Pick the highest-scoring class per row and map it back to the target value
# that one-hot column stands for.
max_labels = onehot_y_columns.to_numpy()[sigma.argmax(axis=1)]
sub = pd.Series(max_labels, index=test.index, name='FloodProbability')
filename = 'submission.csv'
sub.to_csv(filename)