In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [2]:
# Read the raw result table from disk
data = pd.read_csv(r'C:\Users\u1158100\Desktop\GDG_research\delisting_detection_py_project\raw_data\result.csv')

# Build the working sample: every row with delist_tab == 1, plus a seeded
# random draw of 1000 rows with delist_tab == 0 (seed fixed for repeatability)
delist_1_data = data.loc[data['delist_tab'].eq(1)]
delist_0_sample = data.loc[data['delist_tab'].eq(0)].sample(n=1000, random_state=816)

# Stack the two subsets and renumber the rows 0..n-1
combined_data = pd.concat([delist_1_data, delist_0_sample], axis=0, ignore_index=True)

# Report the size of the full table and of the sampled table
print(data.shape, combined_data.shape, sep='\n')

(50801, 77)
(1186, 77)


In [3]:
# Columns handled as categorical features
categorical_vars = ['atc1', 'mnflg', 'vbp_flag', 'VBP_Batch', 'VBP_time', 'NRDL', 'NRDL_LIMTT', 'ENTRY_TIME']

# Every remaining column except the outcome ('delist_tab') is handled as numerical
numerical_vars = [
    col
    for col in combined_data.columns
    if col != 'delist_tab' and col not in categorical_vars
]

# For each categorical column: cast all values to string so the column has a
# single uniform type, then replace the strings with integer codes
for cat_var in categorical_vars:
    le = LabelEncoder()
    combined_data[cat_var] = le.fit_transform(combined_data[cat_var].astype(str))

# Missing values in the numerical columns are replaced with 0
combined_data[numerical_vars] = combined_data[numerical_vars].fillna(0)

# Total count of nulls left anywhere in the frame (displayed as the cell output)
combined_data.isna().sum().sum()

0

In [4]:
# One-hot encode the (already label-encoded) categorical columns.
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and later removed, so passing it fails on current releases.
ohe = OneHotEncoder(dtype=float)
encoded_values = ohe.fit_transform(combined_data[categorical_vars]).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when present and fall back to
# the old one on older installs. Column names come out as '<variable>_<code>'.
feature_names_fn = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
df_encoded = pd.DataFrame(
    encoded_values,
    columns=feature_names_fn(categorical_vars),
    index=combined_data.index,  # keep rows aligned for the axis=1 concat below
)

# Drop original categorical columns and append the one-hot encoded columns
combined_data = combined_data.drop(columns=categorical_vars)
combined_data = pd.concat([combined_data, df_encoded], axis=1)

# Standardise the numerical columns; the scaler is fitted on this same frame
scaler = StandardScaler()
combined_data[numerical_vars] = scaler.fit_transform(combined_data[numerical_vars])

# Display the head of the dataset
combined_data.head()



Unnamed: 0,delist_tab,month_count,_201901,_201902,_201903,_201904,_201905,_201911,_201912,_202007,...,NRDL_0,NRDL_1,NRDL_2,NRDL_LIMTT_0,NRDL_LIMTT_1,ENTRY_TIME_0,ENTRY_TIME_1,ENTRY_TIME_2,ENTRY_TIME_3,ENTRY_TIME_4
0,1,-1.418036,-0.201737,-0.197625,-0.200282,-0.199097,-0.19918,-0.20261,-0.198688,-0.196944,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1,-0.04842,-0.201422,-0.194895,-0.198902,-0.198658,-0.198258,-0.201095,-0.197507,-0.196199,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1,-1.466951,-0.201755,-0.197625,-0.200282,-0.199097,-0.19918,-0.20261,-0.198688,-0.196944,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,-1.466951,-0.201755,-0.197625,-0.200282,-0.199097,-0.19918,-0.20261,-0.198688,-0.196944,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,1,-1.222377,-0.201541,-0.197465,-0.200263,-0.199097,-0.199144,-0.20261,-0.198688,-0.196944,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# Persist the processed frame as CSV; the DataFrame index is not written out
combined_data.to_csv(
    r'C:\Users\u1158100\Desktop\GDG_research\delisting_detection_py_project\processed\processed_data.csv',
    index=False,
)