In [1]:
import pandas as pd
import numpy as np
import os, re, random
import warnings, json
warnings.filterwarnings('ignore')
from collections import defaultdict

# Data location. Change `data_folder` here manually if your data lives elsewhere.
# By default it resolves to the `data` directory inside the parent of the
# current working directory.
current_folder = os.path.abspath(os.curdir)
root_folder = os.path.dirname(current_folder) 
data_folder = os.path.join(root_folder,'data')

%matplotlib inline

def set_seeds(seed):
    """Seed the random number sources used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    start-up, so the assignment here may affect child processes only - confirm.
    """
    # for reproducibility
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


set_seeds(1234)

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Load the training set and show which columns it provides.
train_csv_path = os.path.join(data_folder, 'train.csv')
train_df = pd.read_csv(train_csv_path)
# test_df = pd.read_csv(os.path.join(data_folder,'test.csv'))
train_df.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'eco_category', 'lease_commence_date',
       'latitude', 'longitude', 'elevation', 'subzone', 'planning_area',
       'region', 'resale_price'],
      dtype='object')

In [3]:
# File names of the auxiliary datasets, keyed by a short alias.
# A name is joined onto the `auxiliary-data` folder when the file is read.
auxiliary_paths = {
    # NOTE(review): "commerical" is presumably the spelling of the file on
    # disk - confirm before correcting it.
    'commercial':'sg-commerical-centres.csv',
    'hawker': 'sg-gov-markets-hawker-centres.csv',
    'demographics': 'sg-population-demographics.csv',
    'prisch': 'sg-primary-schools.csv',
    'secsch': 'sg-secondary-schools.csv',
    'malls': 'sg-shopping-malls.csv',
    'station': 'sg-train-stations.csv'
}

In [5]:
# Pick one auxiliary dataset by its alias, load it and preview the first rows.
aux = 'prisch'
print(f'Opening auxiliary data bout "{aux}"...')
aux_csv_path = os.path.join(data_folder, 'auxiliary-data', auxiliary_paths[aux])
aux_df = pd.read_csv(aux_csv_path)
aux_df.head()

Opening auxiliary data bout "prisch"...


Unnamed: 0,name,lat,lng
0,Admiralty Primary School,1.442941,103.800345
1,Ahmad Ibrahim Primary School,1.433849,103.83271
2,Ai Tong School,1.360713,103.833018
3,Alexandra Primary School,1.291284,103.824066
4,Anchor Green Primary School,1.39038,103.887354


In [31]:
# Workbook holding the primary-school rankings. The path is built from
# `data_folder` (like the other auxiliary files) instead of a machine-specific
# absolute path, so the notebook runs wherever the data folder is configured.
path = os.path.join(data_folder, 'auxiliary-data', 'prischrank.xlsx')

# "kiasu" sheet: one row per school with its popularity index, highest first.
rank0 = (
    pd.read_excel(path, sheet_name='kiasu', engine='openpyxl', usecols=['School', 'Index'])
    .sort_values(by='Index', ascending=False)
    .reset_index(drop=True)
)
rank0.head()

Unnamed: 0,School,Index
0,Nanyang Primary,7.2
1,Raffles Girls’ Primary,7.0
2,Rosyth,6.582
3,Nan Hua Primary,6.536
4,Tao Nan,6.455


In [32]:
# Rank schools for a single year by registrations per vacancy, highest first.
year = 2020
over_col = f'{year}over'
rank1 = pd.read_excel(path, sheet_name=str(year), engine='openpyxl')
rank1 = (
    rank1
    .assign(**{over_col: rank1['Registered'] / rank1['Vacancy']})
    .sort_values(by=over_col, ascending=False)
    .reset_index(drop=True)
)
rank1.head()

Unnamed: 0,School,Vacancy,Registered,2020over
0,Princess Elizabeth Primary School,38,226,5.947368
1,South View Primary School,26,139,5.346154
2,Nan Hua Primary School,22,115,5.227273
3,Rosyth School,20,94,4.7
4,Northland Primary School,49,215,4.387755


In [20]:
# Try the fuzzy matcher on one school: score the first school in aux_df
# against every ranked school name and report the best candidate when its
# score falls below the threshold.
fuzzy_threshold = 90

sch = aux_df['name'][0]
fuzzy_scores = rank0['School'].map(lambda candidate: fuzz.token_sort_ratio(candidate, sch))
max_ix = np.argmax(fuzzy_scores)

below_threshold = fuzzy_scores[max_ix] < fuzzy_threshold
if below_threshold:
    print(f'Score {fuzzy_scores[max_ix]}: for "{sch}", found "{rank0.School[max_ix]}"')

Score 83: for "Admiralty Primary School", found "Admiralty Primary"


In [40]:
# Attach the "kiasu" index of the 50 highest-ranked schools to aux_df.
# Each ranked school name is fuzzy-matched against every name in aux_df and
# the index value is written to the best-scoring row; all other rows stay NaN.
fuzzy_threshold = 50
aux_df['KiasuRank'] = np.nan
for _, row in rank0.head(50).iterrows():
    sch = row['School']
    fuzzy_scores = aux_df['name'].apply(lambda x: fuzz.token_sort_ratio(x, sch))
    # idxmax returns the index *label* of the best score, which is what the
    # label-based lookups below need. np.argmax returns a position, which only
    # coincides with the label while aux_df keeps its default 0..n-1 index.
    max_ix = fuzzy_scores.idxmax()
    if fuzzy_scores[max_ix]<fuzzy_threshold:
        # schools with issues: report the weak match for manual review and do
        # not write it, so a below-threshold guess never ends up in the table
        matched_sch = aux_df.loc[max_ix, 'name']
        print(f'Score {fuzzy_scores[max_ix]}, ix {max_ix}: for "{sch}", found "{matched_sch}"')
        continue
    aux_df.loc[max_ix, 'KiasuRank'] = row['Index']
aux_df.head()

Unnamed: 0,name,lat,lng,KiasuRank
0,Admiralty Primary School,1.442941,103.800345,
1,Ahmad Ibrahim Primary School,1.433849,103.83271,
2,Ai Tong School,1.360713,103.833018,4.382
3,Alexandra Primary School,1.291284,103.824066,
4,Anchor Green Primary School,1.39038,103.887354,


In [42]:
# For each year, compute registrations per vacancy for every school in that
# year's sheet and attach the value of the 50 most oversubscribed schools to
# aux_df in a column named "<year>over". Schools are linked by fuzzy name match.
for year in [2020, 2019, 2018, 2017]:

    rank1 = pd.read_excel(path, sheet_name=str(year), engine='openpyxl')
    rank1['Index'] = rank1['Registered']/rank1['Vacancy']
    rank1 = rank1.sort_values(by='Index', ascending=False).reset_index(drop=True)

    year_col = f'{year}over'
    aux_df[year_col] = np.nan
    for _, row in rank1.head(50).iterrows():
        sch = row['School']
        fuzzy_scores = aux_df['name'].apply(lambda x: fuzz.token_sort_ratio(x, sch))
        # idxmax returns the index *label* of the best score, which is what
        # the label-based lookups below need. np.argmax returns a position,
        # which only coincides with the label on a default 0..n-1 index.
        max_ix = fuzzy_scores.idxmax()
        if fuzzy_scores[max_ix]<fuzzy_threshold:
            # schools with issues: report the weak match for manual review and
            # do not write it, so a below-threshold guess is never stored
            matched_sch = aux_df.loc[max_ix, 'name']
            print(f'Score {fuzzy_scores[max_ix]}, ix {max_ix}: for "{sch}", found "{matched_sch}"')
            continue
        aux_df.loc[max_ix, year_col] = row['Index']

aux_df.head()

Unnamed: 0,name,lat,lng,KiasuRank,2020over,2019over,2018over,2017over
0,Admiralty Primary School,1.442941,103.800345,,,1.409836,1.891304,
1,Ahmad Ibrahim Primary School,1.433849,103.83271,,,,,
2,Ai Tong School,1.360713,103.833018,4.382,3.2,1.833333,2.727273,1.363636
3,Alexandra Primary School,1.291284,103.824066,,,,,1.267717
4,Anchor Green Primary School,1.39038,103.887354,,,,,


In [55]:
# Show the columns currently present on aux_df.
aux_df.columns

Index(['name', 'lat', 'lng', 'KiasuRank', '2020over', '2019over', '2018over',
       '2017over'],
      dtype='object')

In [58]:
# Flag schools that received a value in at least one ranking column: the
# row-wise sum (NaN skipped) is positive for them. Flagged rows get an empty
# string, all others None.
# NOTE(review): '' and None presumably both end up as empty fields once the
# frame is written to CSV - confirm the flag survives the round trip.
rank_cols = ['KiasuRank', '2020over', '2019over', '2018over','2017over']
rank_totals = aux_df[rank_cols].sum(axis=1)
aux_df['Top50'] = ['' if total>0 else None for total in rank_totals]
aux_df.head()

Unnamed: 0,name,lat,lng,KiasuRank,2020over,2019over,2018over,2017over,Top50
0,Admiralty Primary School,1.442941,103.800345,,,1.409836,1.891304,,
1,Ahmad Ibrahim Primary School,1.433849,103.83271,,,,,,
2,Ai Tong School,1.360713,103.833018,4.382,3.2,1.833333,2.727273,1.363636,
3,Alexandra Primary School,1.291284,103.824066,,,,,1.267717,
4,Anchor Green Primary School,1.39038,103.887354,,,,,,


In [61]:
# Save the enriched school table into the auxiliary-data folder. The path is
# built from `data_folder` instead of a machine-specific absolute path, so the
# notebook writes to the configured data location on any machine.
aux_df.to_csv(os.path.join(data_folder, 'auxiliary-data', 'sg-primary-schools-wranks.csv'), index=False)