# STATS 503 Kaggle Competition
Group Name: Tongyao Team  
Group Members: TONGYAO JIANG, Yuxiang Gao, Chenfei Wang

## Data processing

### 1.import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

### 2.train/test split

In [256]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [1]:
# train_df_info = train_df.info()
# test_df_info = test_df.info()
# train_df_head = train_df.head()
# test_df_head = test_df.head()

In [258]:
X_train = train_df.drop(['SEQN', 'y'], axis=1)
y_train = train_df['y']
X_pred = test_df.drop(['SEQN'], axis=1)
# X_train.head()
# X_pred.head()

In [261]:
srp_cols = [col for col in X_train.columns if col.startswith('SRP_')]

In [262]:
categorical_cols = ['district']
numerical_cols = [col for col in X_train.columns if col not in categorical_cols]

In [2]:
# X_train.columns

In [264]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

### 3.Preprocess the data (with PCA)

In [265]:
preprocessor_with_pca = ColumnTransformer(
    transformers=[
        ('srp_pca', PCA(n_components=0.95), srp_cols),
        ('num', StandardScaler(), list(set(numerical_cols) - set(srp_cols))),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [266]:
preprocess_result = preprocessor_with_pca.fit_transform(X_train)

In [3]:
# preprocess_result

In [268]:
X_train_preprocessed = pd.DataFrame(preprocess_result)

In [4]:
# X_train_preprocessed

### 4.Preprocess the data (without PCA)

In [270]:
preprocessor_no_pca = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), list(set(numerical_cols))),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [271]:
preprocess_result2 = preprocessor_no_pca.fit_transform(X_train)

In [5]:
# preprocess_result2

In [273]:
X_train_preprocessed2 = pd.DataFrame(preprocess_result2)

In [6]:
# X_train_preprocessed2

In [275]:
preprocess_result_pred = preprocessor_no_pca.fit_transform(X_pred)

In [7]:
# preprocess_result_pred

In [277]:
X_pred_preprocessed = pd.DataFrame(preprocess_result_pred)

In [8]:
# X_pred_preprocessed

## Models

In [279]:
X_train, X_test, y_train, y_test = train_test_split(X_train_preprocessed2, y_train, test_size=0.2, random_state=42)

### Method1: GBR

In [290]:
model_GBR = GradientBoostingRegressor()

In [291]:
model_GBR.fit(X_train, y_train)

In [292]:
model_GBR.score(X_train, y_train)

0.7044288309453615

In [299]:
model_GBR.score(X_test, y_test)

0.5853344682902979

In [293]:
predictions = model_GBR.predict(X_pred_preprocessed)

In [294]:
output_df = pd.DataFrame({'SEQN': test_df['SEQN'], 'y': predictions})

In [295]:
output_df.head()

Unnamed: 0,SEQN,y
0,492834,0.616448
1,309349,1.806131
2,468308,1.373551
3,838812,0.178037
4,947936,1.490784


In [296]:
# output_file_path = 'kaggle_sub1.csv'
# output_df.to_csv(output_file_path, index=False)

### Method2: Linear Regression

In [300]:
model_LR = LinearRegression()

In [301]:
model_LR.fit(X_train, y_train)

In [358]:
model_LR.score(X_train, y_train)

0.6420549058698753

In [359]:
model_LR.score(X_test, y_test)

0.6512748925966163

In [304]:
pred_LR = model_LR.predict(X_pred_preprocessed)

In [305]:
output_LR = pd.DataFrame({'SEQN': test_df['SEQN'], 'y': pred_LR})

In [306]:
output_LR.head()

Unnamed: 0,SEQN,y
0,492834,-1.056414
1,309349,1.44038
2,468308,0.893624
3,838812,-0.171051
4,947936,1.944869


In [307]:
# output_LR_path = 'kaggle_sub_LR.csv'
# output_LR.to_csv(output_LR_path, index=False)

### Method3: Random Forest

In [308]:
model_RF = RandomForestRegressor(random_state=42)

In [309]:
model_RF.fit(X_train, y_train)

In [310]:
model_RF.score(X_train, y_train)

0.9440333271876391

In [311]:
model_RF.score(X_test, y_test)

0.6061024919807707

In [312]:
pred_RF = model_RF.predict(X_pred_preprocessed)

In [313]:
output_RF = pd.DataFrame({'SEQN': test_df['SEQN'], 'y': pred_RF})

In [314]:
output_RF.head()

Unnamed: 0,SEQN,y
0,492834,0.79638
1,309349,1.85976
2,468308,1.59232
3,838812,0.14064
4,947936,1.68643


In [315]:
# output_RF_path = 'kaggle_sub_RF.csv'
# output_RF.to_csv(output_RF_path, index=False)

### Method4: Neural Network

In [316]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [319]:
model_NN = MLPRegressor(random_state=1, activation='relu', max_iter=1000)

In [320]:
model_NN.fit(X_train, y_train)

In [321]:
model_NN.score(X_train, y_train)

0.9879074683268374

In [322]:
model_NN.score(X_test, y_test)

0.7367898923692984

In [323]:
NN_cv_scores = cross_val_score(model_NN, X_train, y_train, cv=5, scoring='r2')

In [324]:
NN_cv_scores

array([0.69829826, 0.7361461 , 0.69750374, 0.71965715, 0.71770138])

#### 1.Grid Search

In [325]:
model_NN_GS = MLPRegressor(random_state=1, max_iter=500)

In [326]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
}

In [327]:
grid_search = GridSearchCV(model_NN_GS, param_grid, cv=3, scoring='r2')

In [328]:
# grid_search.fit(X_train_preprocessed, y_train)

In [329]:
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.2f}".format(-grid_search.best_score_))

In [330]:
# grid_search.score(X_train_preprocessed, y_train)

#### 2. using Pytorch ReLu

In [None]:
# !pip install torch

In [341]:
y_train_torch = train_df['y']

In [342]:
import torch
import torch.nn as nn
import torch.optim as optim

In [343]:
class StudentPerformancePredictor(nn.Module):
    def __init__(self):
        super(StudentPerformancePredictor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(60, 128),  # Input layer with 60 features
            nn.ReLU(),           # Activation function
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.network(x)

In [344]:
model_torch = StudentPerformancePredictor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model_torch.parameters(), lr=0.001)

In [345]:
model_torch

StudentPerformancePredictor(
  (network): Sequential(
    (0): Linear(in_features=60, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [346]:
train_features_tensor = torch.tensor(preprocess_result2, dtype=torch.float32)
train_labels_tensor = torch.tensor(y_train_torch.values, dtype=torch.float32).view(-1, 1)

In [347]:
epochs = 200
batch_size = 64

In [348]:
def r2_score(outputs, labels):
    total_variance = torch.sum((labels - labels.mean()) ** 2)
    residual_variance = torch.sum((labels - outputs) ** 2)
    r2 = 1 - (residual_variance / total_variance)
    return r2

In [349]:
# Training loop
for epoch in range(epochs):
    permutation = torch.randperm(train_features_tensor.size()[0])
    for i in range(0, train_features_tensor.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = train_features_tensor[indices], train_labels_tensor[indices]

        outputs = model_torch(batch_x)
        loss = criterion(outputs, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model_torch.eval()
    with torch.no_grad():
        train_outputs = model_torch(train_features_tensor)
        train_r2 = r2_score(train_outputs, train_labels_tensor)

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, R^2: {train_r2:.4f}')

Epoch [10/200], Loss: 0.1880, R^2: 0.9320
Epoch [20/200], Loss: 0.1407, R^2: 0.9630
Epoch [30/200], Loss: 0.0619, R^2: 0.9786
Epoch [40/200], Loss: 0.0601, R^2: 0.9878
Epoch [50/200], Loss: 0.0279, R^2: 0.9913
Epoch [60/200], Loss: 0.0154, R^2: 0.9926
Epoch [70/200], Loss: 0.0155, R^2: 0.9937
Epoch [80/200], Loss: 0.0195, R^2: 0.9926
Epoch [90/200], Loss: 0.0096, R^2: 0.9942
Epoch [100/200], Loss: 0.0181, R^2: 0.9932
Epoch [110/200], Loss: 0.0118, R^2: 0.9935
Epoch [120/200], Loss: 0.0105, R^2: 0.9948
Epoch [130/200], Loss: 0.0120, R^2: 0.9957
Epoch [140/200], Loss: 0.0051, R^2: 0.9970
Epoch [150/200], Loss: 0.0062, R^2: 0.9960
Epoch [160/200], Loss: 0.0090, R^2: 0.9957
Epoch [170/200], Loss: 0.0065, R^2: 0.9969
Epoch [180/200], Loss: 0.0046, R^2: 0.9972
Epoch [190/200], Loss: 0.0055, R^2: 0.9970
Epoch [200/200], Loss: 0.0061, R^2: 0.9969


In [350]:
test_features_tensor = torch.tensor(preprocess_result_pred, dtype=torch.float32)

with torch.no_grad():
    predictions = model_torch(test_features_tensor)

In [351]:
predicted_scores = predictions.numpy()
# print(predicted_scores)

[[-0.14556947]
 [ 1.5869352 ]
 [ 1.1282394 ]
 ...
 [-0.07992034]
 [-1.1706951 ]
 [-0.92825353]]


In [352]:
predictions_array = predictions.numpy()

predicted_df = pd.DataFrame({
    'SEQN': test_df['SEQN'],
    'y': predictions_array.flatten()
})

In [353]:
predicted_df.to_csv('kaggle_sub_torch2.csv', index=False)

##### prediction NN

In [354]:
# pred_NN = model_NN.predict(X_pred)

In [355]:
# output_NN = pd.DataFrame({'SEQN': test_df['SEQN'], 'y': pred_NN})

In [356]:
# output_NN.head()

In [357]:
# output_NN_path = 'kaggle_sub_NN.csv'
# output_NN.to_csv(output_NN_path, index=False)