# STAT8107 Data Mining Techniques
## Group Project

**Members (UID):**  
TBC

# Import modules and configuration

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt

from typing import List

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

%matplotlib inline

# Functions

In [59]:
def plt_add_labels(x: List[str],
                   y: List[int]) -> None:
    """Annotate a bar plot by writing each bar's value just above the bar.

    The i-th label is drawn with ``plt.text`` at horizontal position ``i`` and
    at a height of ``y[i]`` plus a small gap equal to 1% of the mean of ``y``,
    centred horizontally on that position.

    Parameters
    ----------
    x : sequence of str
        Bar categories; only its length is used (one label per entry).
    y : sequence of numbers
        Bar heights. May be a plain list, a NumPy array or any other
        array-like; it is converted with ``np.asarray`` so that ``.mean()``
        is available regardless of the container type.

    Returns
    -------
    None
        The labels are added to the current matplotlib axes as a side effect.
    """
    # Nothing to label; also avoids taking the mean of an empty array.
    if len(x) == 0:
        return

    # A plain Python list has no .mean(), so normalise to an ndarray first.
    heights = np.asarray(y)

    # Vertical gap between the top of a bar and its label.
    y_scale = heights.mean() / 100

    for i in range(len(x)):
        plt.text(i, heights[i] + y_scale, heights[i], ha='center')

def print_missing_val_count(df: pd.DataFrame) -> None:
    """Print a per-column summary of missing values in ``df``.

    A header line is always printed. It is followed by one line for every
    column that contains at least one null, giving the null count and its
    share of the total number of rows (one decimal place). Columns without
    nulls are not listed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; it is not modified.
    """
    total_rows = len(df)

    # Null count per column, restricted to the columns that actually have nulls
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]

    print("The following columns have missing values:")
    for column_name, n_missing in columns_with_nulls.items():
        pct_missing = 100 * n_missing / total_rows
        print(f"{column_name} has {n_missing} ({pct_missing:0.1f}%) missing value.")

# Load Data

In [3]:
# Load the five project CSV files from the working directory, one DataFrame
# per file, in the order: training prices, test prices, schools, subways,
# submission template.
(
    price_train_df,
    price_test_df,
    school_df,
    subway_df,
    submission_price_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'trainPrice.csv',
        'testPrice.csv',
        'Schools.csv',
        'Subways.csv',
        'submissionPrice.csv',
    )
)

## Input Files and Their Basic Information

In [4]:
price_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1601458 entries, 0 to 1601457
Data columns (total 25 columns):
 #   Column                              Non-Null Count    Dtype  
---  ------                              --------------    -----  
 0   key                                 1601458 non-null  int64  
 1   apartment_id                        1601458 non-null  int64  
 2   city                                1601458 non-null  int64  
 3   transaction_year_month              1601458 non-null  int64  
 4   transaction_date                    1601458 non-null  object 
 5   year_of_completion                  1601458 non-null  int64  
 6   exclusive_use_area                  1601458 non-null  float64
 7   floor                               1601458 non-null  int64  
 8   latitude                            1601458 non-null  float64
 9   longitude                           1601458 non-null  float64
 10  address_by_law                      1601458 non-null  int64  
 11  total_parki

In [5]:
# price_train_df.head(10)

In [6]:
price_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3918 entries, 0 to 3917
Data columns (total 25 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   key                                 3918 non-null   int64  
 1   apartment_id                        3918 non-null   int64  
 2   city                                3918 non-null   int64  
 3   transaction_year_month              3918 non-null   int64  
 4   transaction_date                    3918 non-null   object 
 5   year_of_completion                  3918 non-null   int64  
 6   exclusive_use_area                  3918 non-null   float64
 7   floor                               3918 non-null   int64  
 8   latitude                            3918 non-null   float64
 9   longitude                           3918 non-null   float64
 10  address_by_law                      3918 non-null   int64  
 11  total_parking_capacity_in_site      3547 no

In [7]:
# price_test_df.head(10)

In [8]:
school_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1921 entries, 0 to 1920
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   school_code      1921 non-null   object 
 1   latitude         1921 non-null   float64
 2   longitude        1921 non-null   float64
 3   school_class     1921 non-null   object 
 4   operation_type   1921 non-null   object 
 5   highschool_type  462 non-null    object 
 6   gender           1921 non-null   object 
 7   foundation_date  1921 non-null   object 
 8   address_by_law   1921 non-null   int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 135.2+ KB


In [9]:
# school_df.head(10)

In [10]:
subway_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 405 entries, 0 to 404
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   station_id      405 non-null    int64  
 1   latitude        405 non-null    float64
 2   longitude       405 non-null    float64
 3   subway_line     405 non-null    object 
 4   address_by_law  396 non-null    float64
dtypes: float64(3), int64(1), object(1)
memory usage: 15.9+ KB


In [11]:
# subway_df.head(10)

In [12]:
submission_price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3918 entries, 0 to 3917
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   key                     3918 non-null   int64
 1   transaction_real_price  3918 non-null   int64
dtypes: int64(2)
memory usage: 61.3 KB


# Exploratory Data Analysis (EDA)

## Training Set (trainPrice.csv)

## Univariate Analysis

## Field - City

In [13]:
# Distinct values
price_train_df["city"].drop_duplicates().sort_values().to_list() # There are only 2 distinct values

[0, 1]

In [14]:
# Sample few (latitude, longitude) pairs from records with city = 0
price_train_df[price_train_df["city"] == 0][["latitude", "longitude"]].head(3)

Unnamed: 0,latitude,longitude
1491,35.11756,129.011018
1492,35.118632,129.010886
1493,35.081732,129.019474


In [15]:
# Sample few (latitude, longitude) pairs from records with city = 1
price_train_df[price_train_df["city"] == 1][["latitude", "longitude"]].head(3)

Unnamed: 0,latitude,longitude
0,37.585965,127.000231
1,37.585965,127.000231
2,37.580511,127.014016


## Field - transaction_real_price

In [16]:
# # Histogram of Transaction Real Price
# fig, ax = plt.subplots(figsize=(15, 8))
# plt.hist(price_train_df["transaction_real_price"]/1000000, bins=100)
# plt.title('Histogram of Transaction Real Price')
# plt.xlabel('Transaction Real Price (1,000,000)')
# plt.ylabel('Count')
# plt.show()

In [17]:
# Report the largest transaction_real_price in the training set, expressed in
# millions; int() truncates any fractional part of the scaled value.
print(f"Maximum transaction price is ${int(price_train_df['transaction_real_price'].max()/1000000)}M")

Maximum transaction price is $8200M


## Field - transaction_year_month

In [35]:
# # Bar plot of Transaction Year Month
# # Drop missing values (NA) and calculate the counts of each value
# val_count = price_train_df["transaction_year_month"].dropna().astype("str").value_counts()
# val_count = val_count.sort_index()

# val_mean = val_count.mean()

# # Draw bar plot
# fig, ax = plt.subplots(figsize=(15, 8))
# plt.bar(val_count.index, val_count.values)
# plt.title('Bar Plot of Transaction Year Month')
# plt.xlabel('Transaction Year Month')
# plt.ylabel('Count')
# plt.xticks(range(len(val_count.index))[::2], val_count.index[::2], rotation=90)
# plt.show()

# print(f"The mean of transaction count per month is {val_mean: .2f}.")

## Fields - total_parking_capacity_in_site, total_household_count_in_sites, room_count and bathroom_count

In [19]:
# # Create a figure and add the axes for subgraphs
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# # Histogram of Total Parking Capacity
# axes[0,0].hist(price_train_df["total_parking_capacity_in_site"], bins=30)
# axes[0,0].set(title='Histogram of Total Parking Capacity')
# axes[0,0].set_xlabel('Total Parking Capacity')
# axes[0,0].set_ylabel('Count')

# # Histogram of Total Household Count
# axes[0,1].hist(price_train_df["total_household_count_in_sites"], bins=30)
# axes[0,1].set(title='Histogram of Total Household Count')
# axes[0,1].set_xlabel('Total Household Count')
# axes[0,1].set_ylabel('Count')

# # # Bar plot of Room Count
# val_count = price_train_df["room_count"].dropna().value_counts() / 1000
# val_count = val_count.sort_index()
# axes[1,0].bar(val_count.index, val_count.values)
# axes[1,0].set(title='Bar Plot of Room Count')
# axes[1,0].set_xlabel('Room Count')
# axes[1,0].set_ylabel('Count (1,000)')

# # # Bar plot of Bathroom Count
# val_count = price_train_df["bathroom_count"].dropna().value_counts() / 1000
# val_count = val_count.sort_index()
# axes[1,1].bar(val_count.index, val_count.values)
# axes[1,1].set(title='Bar Plot of Bathroom Count')
# axes[1,1].set_xlabel('Bathroom Count')
# axes[1,1].set_ylabel('Count (1,000)')

# plt.tight_layout()
# plt.show()

In [20]:
# Summarise room_count and bathroom_count: mean to two decimals, then the mode.
# Series.mode() returns all modal values in sorted order, so [0] picks the
# smallest one if there is a tie.
print(f"The mean of room_count and bathroom_count are {price_train_df['room_count'].mean():.2f} and {price_train_df['bathroom_count'].mean():.2f} respectively")
print(f"The mode of room_count and bathroom_count are {price_train_df['room_count'].mode()[0]} and {price_train_df['bathroom_count'].mode()[0]} respectively")

The mean of room_count and bathroom_count are 2.95 and 1.59 respectively
The mode of room_count and bathroom_count are 3.0 and 2.0 respectively


## Multivariate Analysis

## Fields - city, latitude and longitude

In [21]:
# # Sample 1000 records from each class
# price_train_sample = price_train_df.groupby('city').apply(lambda x: x.sample(1000))

# # Scatter Plot of Latitude and Longitude with city labels
# scatter = plt.scatter(price_train_sample['longitude'], price_train_sample['latitude'], c=price_train_sample['city'], cmap='summer')
# plt.title('Latitude and Longitude of City')
# plt.xlabel('Longitude')
# plt.ylabel('Latitude')
# plt.legend(*scatter.legend_elements(), loc='upper right')
# plt.show()

## Fields - All Numeric Fields

In [29]:
# price_train_df.corr().style.background_gradient(cmap='coolwarm').set_precision(2).set_table_styles([dict(selector="th.col_heading",props=[("writing-mode", "vertical-rl")])])

## School Data Set (Schools.csv)

## Field - school_class

In [70]:
# # Bar plot of School Class
# # Drop missing values (NA) and calculate the counts of each value
# val_count = school_df["school_class"].dropna().astype("str").value_counts()

# # Draw bar plot
# fig, ax = plt.subplots(figsize=(12, 6))
# plt.bar(val_count.index, val_count.values)
# plt.title('Bar Plot of School Class')
# plt.xlabel('School Class')
# plt.ylabel('Count')
# plt_add_labels(val_count.index, val_count.values)
# plt.xticks(range(len(val_count.index)), val_count.index)
# plt.show()

## Field - operation_type

In [74]:
# # Bar plot of Operation Type of Schools
# # Drop missing values (NA) and calculate the counts of each value
# val_count = school_df["operation_type"].dropna().astype("str").value_counts()

# # Draw bar plot
# fig, ax = plt.subplots(figsize=(12, 6))
# plt.bar(val_count.index, val_count.values)
# plt.title('Bar Plot of Operation Type of Schools')
# plt.xlabel('Operation Type')
# plt.ylabel('Count')
# plt_add_labels(val_count.index, val_count.values)
# plt.xticks(range(len(val_count.index)), val_count.index)
# plt.show()

## Field - highschool_type

In [1]:
# # Bar plot of High School Type
# # Drop missing values (NA) and calculate the counts of each value
# val_count = school_df["highschool_type"].dropna().astype("str").value_counts()

# # Draw bar plot
# fig, ax = plt.subplots(figsize=(12, 6))
# plt.bar(val_count.index, val_count.values)
# plt.title('Bar Plot of High School Type')
# plt.xlabel('High School Type')
# plt.ylabel('Count')
# plt_add_labels(val_count.index, val_count.values)
# plt.xticks(range(len(val_count.index)), val_count.index)
# plt.show()

In [None]:
# Frequency of each value in the schools' gender column (nulls dropped first,
# remaining values cast to str); displayed as the cell's output.
school_df["gender"].dropna().astype("str").value_counts()

## Subway Data Set (Subways.csv)

In [23]:
# Each subway_line entry is a comma-separated string of line codes. Split every
# distinct entry, pool the individual codes into a set to de-duplicate them,
# and keep the result as a sorted list.
subway_line_list = sorted({
    line_code
    for line_group in subway_df["subway_line"].drop_duplicates().tolist()
    for line_code in line_group.split(",")
})
print(subway_line_list)

['1', '2', '3', '4', '5', '6', '7', '8', '9', 'AP', 'B1', 'B2', 'B3', 'B4', 'BD', 'BK', 'DL', 'KC', 'KJ', 'ND', 'US']


## Fields - latitude and longitude

In [24]:
# # Scatter Plot of Latitude and Longitude of schools and subways
# plt.scatter(school_df['longitude'], school_df['latitude'], color='blue', label='School')
# plt.scatter(subway_df['longitude'], subway_df['latitude'], color='green', label='Subway')
# plt.title('Latitude and Longitude of Schools and Subways')
# plt.xlabel('Longitude')
# plt.ylabel('Latitude')
# plt.legend(loc='upper right')
# plt.show()

## Missing Values and Outliers

In [25]:
print_missing_val_count(price_train_df)

The following columns have missing values:
total_parking_capacity_in_site has 91813 (5.7%) missing value.
tallest_building_in_sites has 9 (0.0%) missing value.
lowest_building_in_sites has 9 (0.0%) missing value.
heat_type has 2017 (0.1%) missing value.
heat_fuel has 9667 (0.6%) missing value.
room_count has 691 (0.0%) missing value.
bathroom_count has 691 (0.0%) missing value.
front_door_structure has 13892 (0.9%) missing value.


In [31]:
print_missing_val_count(school_df)

The following columns have missing values:
highschool_type has 1459 (76.0%) missing value.


In [28]:
print_missing_val_count(subway_df)

The following columns have missing values:
address_by_law has 9 (2.2%) missing value.
