# Preparation

### Upload your Kaggle API token (kaggle.json)

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


### Download competition data from kaggle

In [2]:
# Download the competition archive (2023-data-science-hw1.zip) with the Kaggle CLI,
# using the token installed under ~/.kaggle/ in the previous step.
!kaggle competitions download -c 2023-data-science-hw1

Downloading 2023-data-science-hw1.zip to /content
  0% 0.00/493k [00:00<?, ?B/s]
100% 493k/493k [00:00<00:00, 93.8MB/s]


### Extract the compressed dataset

In [3]:
from zipfile import ZipFile

# Archive fetched by the Kaggle CLI download step.
dataset = '/content/2023-data-science-hw1.zip'

# The handle is named `archive` (not `zip`) so the built-in zip() is not
# shadowed for the remaining cells of this kernel session.
with ZipFile(dataset, 'r') as archive:
  # No target path is given, so members land in the current working directory.
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


# Implementation

Import the necessary libraries and modules, including pandas for data manipulation, numpy for numerical operations, sklearn for machine learning, XGBClassifier for XGBoost, and shuffle to shuffle the data.

In [45]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from xgboost.sklearn import XGBClassifier
from sklearn.utils import shuffle
from pandas.core.frame import DataFrame

A seed value is set for reproducibility.

In [46]:
# Shared random_state passed to the resampling and shuffling steps below.
seed = 42

Read the training and test datasets from CSV files.

In [47]:
# Load the training and test CSVs from the working directory into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

Remove rows with missing values from both the training and test datasets.

In [48]:
# Keep only the rows in which every column is populated.
# NOTE(review): any test row dropped here also disappears from the predictions
# and from the submission file, whose ids are just positions — confirm that the
# test set contains no missing values.
train, test = (frame.dropna(axis='index', how='any') for frame in (train, test))

Balance the two classes of the target column 'Attribute21': resample the 'No' rows with replacement until there are as many of them as 'Yes' rows, then concatenate the resampled 'No' rows with the original 'Yes' rows.

In [49]:
# Split the training rows by the value of the 'Attribute21' target.
# The GroupBy object gets its own name so `train` stays a DataFrame and this
# cell can be re-run without failing on an already-grouped `train`.
train_by_label = train.groupby('Attribute21')
no = train_by_label.get_group('No')
yes = train_by_label.get_group('Yes')

# Draw len(yes) rows from the 'No' class with replacement (seeded, so the draw
# is repeatable); both classes then contribute the same number of rows.
oversampled_no = no.sample(len(yes), replace=True, random_state=seed)

# Stack the untouched 'Yes' rows on top of the resampled 'No' rows.
balanced_train = pd.concat([yes, oversampled_no], axis=0)

Shuffle the rows of the concatenated dataset for randomness.

In [50]:
# Shuffle all rows: frac=1 samples the whole frame without replacement, and the
# fixed random_state makes the resulting order repeatable.
balanced_train = balanced_train.sample(frac=1, random_state=seed)

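Preprocess the data: extract the binary target from 'Attribute21', drop the columns that are not used as features, one-hot encode the categorical columns, and convert both the training and test sets to NumPy arrays.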

In [51]:
# Columns that receive special treatment during preprocessing.
target_column = 'Attribute21'
unused_columns = ['Attribute1']
categorical_columns = ['Attribute8', 'Attribute9', 'Attribute20']

# Extract the target by column name (independent of column order) and encode
# it as 1 for 'Yes' and 0 for anything else.
train_result = np.where(balanced_train[target_column].to_numpy() == 'Yes', 1, 0)

# Training features: drop the target and the unused column, then one-hot
# encode the categorical columns.
train_features = pd.get_dummies(
    data=balanced_train.drop(columns=[target_column] + unused_columns),
    columns=categorical_columns)

# Test features: same steps (the test frame is not expected to carry the target).
test_features = pd.get_dummies(
    data=test.drop(columns=unused_columns),
    columns=categorical_columns)

# get_dummies only creates columns for the categories present in the frame it
# is given, so the two frames can differ in column count or order. Align the
# test frame to the training layout: categories missing from the test set
# become all-zero columns, and columns unknown to the model are discarded.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

# Hand plain NumPy arrays to the model.
# Note: `balanced_train` and `test` are rebound from DataFrames to arrays here,
# so this cell can only be run once after the data has been (re)loaded.
balanced_train = train_features.to_numpy()
test = test_features.to_numpy()

 Initialize an XGBoost classifier and train it on the preprocessed training data.

In [None]:
# Configure the XGBoost classifier.
# subsample / colsample_bytree make training stochastic (a fraction of the rows
# and of the columns is sampled), so the notebook-wide seed is passed as
# random_state to tie the fitted model to the same documented seed as the
# resampling and shuffling steps instead of the library's default seed.
module = XGBClassifier(colsample_bytree=0.8,
                       learning_rate=0.1,
                       max_depth=27,
                       missing=-999,  # feature value to be treated as missing
                       n_estimators=500,
                       random_state=seed,
                       subsample=0.8)

# Fit on the balanced, one-hot-encoded training array and its 0/1 labels.
module.fit(balanced_train, train_result)

Use the trained model to make predictions on the test data.

In [53]:
# Predict a class label for every row of the preprocessed test array, using
# the classifier fitted in the previous cell.
predict = module.predict(test)

Create a submission file in CSV format with the predicted results and save it. The header contains columns 'id' and 'ans', where 'id' is the index and 'ans' is the predicted label.

In [54]:
# Build the submission table with one row per prediction:
#   'id'  - position of the prediction, stored as a float (0.0, 1.0, ...)
#   'ans' - predicted label rendered as an integer string
result = pd.DataFrame({
    'id': [float(position) for position in range(len(predict))],
    'ans': [str(int(label)) for label in predict],
})

# The column names are written as the CSV header row; the index is omitted.
result.to_csv("my_submit.csv", index=False, encoding='utf-8')