# Preparation

### Upload your own Kaggle API token (kaggle.json)

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


### Download competition data from kaggle

In [3]:
# Download the competition archive with the kaggle CLI (relies on the kaggle.json placed in ~/.kaggle/ earlier).
!kaggle competitions download -c 2023-ntust-data-science-hw2

Downloading 2023-ntust-data-science-hw2.zip to /content
 82% 1.00M/1.22M [00:00<00:00, 1.43MB/s]
100% 1.22M/1.22M [00:00<00:00, 1.68MB/s]


### Extract the compressed dataset

In [4]:
from zipfile import ZipFile
# Path of the competition archive downloaded by the kaggle CLI.
dataset = '/content/2023-ntust-data-science-hw2.zip'

# Bind the archive to a name other than `zip`: notebook cells share one global
# namespace, so `as zip` would replace the built-in zip() for every later cell
# and make any subsequent zip(...) call fail with a TypeError.
with ZipFile(dataset, 'r') as archive:
  archive.extractall()  # no target path given, so files go to the current working directory
  print('The dataset is extracted')

The dataset is extracted


# Implementation

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np

# Step 1: Load and Explore Data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test_4000.csv')

# Step 2: Data Preprocessing
# Discard every row that contains at least one missing value
train_data = train_data.dropna()
test_data = test_data.dropna()

# Column names 'Feature 1' through 'Feature 13'
features_for_scaling_and_clustering = [f'Feature {i}' for i in range(1, 14)]

# Restrict the training frame to those feature columns
train_features = train_data[features_for_scaling_and_clustering]

# 'Feature 13' is excluded from scaling and clustering; the remaining twelve
# columns are treated as numeric
numeric_features = train_features.drop(columns=['Feature 13'])

# Step 3: Feature Scaling
# StandardScaler rescales each numeric column to mean=0 and variance=1
numeric_transformer = StandardScaler()

# Apply the scaler to the numeric columns, selected by name
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features.columns)]
)

# Single-step pipeline wrapping the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessor on the training features and scale them in one call
# (`preprocessor` is the same object as pipeline.named_steps['preprocessor'])
numeric_features_scaled = preprocessor.fit_transform(numeric_features)

# Step 4: Density-Based Clustering (DBSCAN)
eps_value = 2.5
min_samples_value = 3
dbscan = DBSCAN(eps=eps_value, min_samples=min_samples_value)

# Store each training row's cluster label alongside its features
train_data['cluster'] = dbscan.fit(numeric_features_scaled).labels_

# Step 5: Prediction on Test Set
# DBSCAN has no predict(): calling fit_predict() on the test rows re-clusters
# them from scratch, so the resulting labels are unrelated to the training
# clusters (and the col_1 / col_2 labelings are unrelated to each other).
# Each test pair refers to training songs through song_id, so the cluster of a
# song is simply the label it already received in train_data['cluster'].
NOISE_LABEL = -1  # label scikit-learn's DBSCAN assigns to points that belong to no cluster

# One cluster label per song_id; duplicates are collapsed so the lookup index is unique
cluster_by_song_id = (
    train_data
    .drop_duplicates(subset='song_id', keep='last')
    .set_index('song_id')['cluster']
)

# Look up the cluster of each side of the pair. Songs that are not present in
# the (NaN-filtered) training data map to NaN instead of silently dropping or
# misaligning rows the way an inner merge would.
test_data['cluster_col_1'] = test_data['col_1'].map(cluster_by_song_id)
test_data['cluster_col_2'] = test_data['col_2'].map(cluster_by_song_id)

# Step 6: Compare Clusters and Write to Submission File
# A pair is in the same group only when both songs carry the same real cluster
# label: NaN never equals NaN, and two noise points share the label -1 without
# belonging to a common cluster, so both cases yield 0.
same_group = (
    test_data['cluster_col_1'].eq(test_data['cluster_col_2'])
    & test_data['cluster_col_1'].ne(NOISE_LABEL)
)

# Build the submission in one vectorized step (no per-row DataFrame + concat);
# this also keeps the original dtype of `id`.
submission_df = pd.DataFrame({
    'id': test_data['id'].to_numpy(),
    'ans': same_group.astype(int).to_numpy(),
})

# Step 7: Save Submission File
# Write the id/ans table to Submit.csv; index=False keeps the DataFrame's row index out of the file.
submission_df.to_csv('Submit.csv', index=False)