In [1]:
import tabula
import pandas as pd
import re
from json import load, dump

In [2]:
# Convert the neat PDF into an even neater dataframe.
# Older tabula-py releases return a single DataFrame from read_pdf, while
# tabula-py >= 2.0 returns a list with one DataFrame per detected table.
# Both shapes are handled so the notebook runs on either version.
column_names = ["name", "city", "country", "school", "list"]
tables = tabula.read_pdf("deanlist_2017_fall.pdf", pages="all")
if isinstance(tables, list):
    # Give every per-table frame the same column names *before* concatenating,
    # otherwise the frames would be aligned on whatever header tabula inferred.
    # NOTE(review): with one table per page tabula may treat the first row of
    # each page as a header -- verify no student row is lost in that case.
    for table in tables:
        table.columns = column_names
    raw_df = pd.concat(tables, ignore_index=True)
else:
    raw_df = pd.DataFrame(tables)
    raw_df.columns = column_names

In [3]:
# Keep raw_df untouched as a reference; all cleaning happens on the copy `df`.
df = raw_df.copy()

# Clean the dataframe: lowercase each city and keep only the text before the
# first run of non-word characters (so "Xi'an" becomes "xi" and anything after
# a comma or space is discarded). Names truncated this way are repaired by hand
# further down.
# The pattern is a raw string: '\W' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
df["city"] = df["city"].apply(lambda city: re.split(r'\W+', city.lower())[0])

In [4]:
# Preview the first rows of the cleaned dataframe
df.head()

Unnamed: 0,name,city,country,school,list
0,Avery Ao,taiyuan,China,Colege of Agricultural and Life Sciences,Dean's List
1,Dongrui Bai,xian,China,Colege of Letters & Science,Dean's List
2,Tony Bai,beijing,China,Colege of Engineering,Dean's Honor List
3,Yuchen Bai,beijing,China,Colege of Letters & Science,Dean's List
4,Xinya Bi,jinan,China,School of Business,Dean's List


In [5]:
# Collect the distinct city names once, report how many there are,
# then display the set itself as the cell output.
unique_cities = set(df["city"])
print(len(unique_cities))
unique_cities

105


{'anqing',
 'anshan',
 'baoding',
 'beijing',
 'beijng',
 'binzhou',
 'changchun',
 'changde',
 'changsha',
 'changshu',
 'changzhou',
 'chengdu',
 'chongqing',
 'dalian',
 'daqing',
 'dongiyng',
 'dunhuang',
 'foshan',
 'fuding',
 'fuzhou',
 'gaoyou',
 'guangzhou',
 'gui',
 'guilin',
 'guiyang',
 'haikou',
 'hangzhou',
 'harbin',
 'hefei',
 'hohhot',
 'huizhou',
 'jianyang',
 'jiaxing',
 'jinan',
 'jinhua',
 'jining',
 'jinzhou',
 'kaifeng',
 'karamay',
 'kunming',
 'kunshan',
 'langfang',
 'lanzhou',
 'leshan',
 'liaoyang',
 'linyi',
 'loudi',
 'luoyang',
 'luzhou',
 'maanshan',
 'meishan',
 'mianyang',
 'nanchang',
 'nanjing',
 'nanning',
 'nantong',
 'nanyang',
 'ningbo',
 'pudong',
 'qingdao',
 'quanzhou',
 'quzhou',
 'sh',
 'shanghai',
 'shantou',
 'shaoxing',
 'shengzhou',
 'shenyang',
 'shenzhen',
 'shijiazhuang',
 'shouguang',
 'suqian',
 'suzhou',
 'taian',
 'taiyuan',
 'taiyuanshi',
 'tangshan',
 'tianjin',
 'tianshui',
 'urumqi',
 'weifang',
 'weihai',
 'wenzhou',
 'wuhan',

In [6]:
# Some city names are still not clean after the split above, such as `sh`.
# There are only a few such cases, so they are listed here and fixed manually.
# The coordinate file is opened in a `with` block so the handle is closed
# as soon as the JSON has been parsed.
with open("china_coord.json", "r") as coord_file:
    city_coord = load(coord_file)

# Print every city value (once per row, duplicates included) that has no
# entry in the coordinate lookup.
for c in df["city"]:
    if c not in city_coord:
        print(c)

beijng
pudong
xi
dongiyng
sh
xi
xi
xi
xi
zhongqing
yongtaizhuang
xi
gui
xi
xi
taiyuanshi
taiyuanshi


In [7]:
# Manual repairs for the city values that have no entry in city_coord.
# Every fix is applied to *all* matching rows and the cell can be re-run
# safely (the previous `[0]` lookup for "beijng" only touched the first match
# and raised IndexError once the typo had already been corrected).
# NOTE(review): "xi" is mapped to "xi'an" while other rows already carry
# "xian"; confirm whether both spellings should remain in the output.
city_fixes = {
    "beijng": "beijing",
    "xi": "xi'an",
    "pudong": "shanghai",
    "dongiyng": "dongying",
    "zhongqing": "chongqing",
    "yongtaizhuang": "beijing",
    "gui": "guiyang",
    "taiyuanshi": "taiyuan",
}

# Rows whose city is just "sh" are dropped rather than corrected.
df = df.loc[df["city"] != "sh"].copy()

# Replace known bad spellings; any value not in the mapping is kept as-is.
df["city"] = df["city"].map(lambda city: city_fixes.get(city, city))

In [8]:
# Add latitude and longitude columns to the dataframe.
# Each city's entry in city_coord is read as ["coord"][0] -> lat and
# ["coord"][1] -> long. A city missing from city_coord raises KeyError,
# exactly as the row-by-row version did.
coords = [city_coord[city]["coord"] for city in df["city"]]
lat = [coord[0] for coord in coords]
long = [coord[1] for coord in coords]

# Assign whole columns at once instead of writing cell-by-cell inside
# iterrows(); the lists are in row order, so positional assignment lines up.
df["lat"] = lat
df["long"] = long

In [9]:
# Preview the first rows again, now including the lat/long columns
df.head()

Unnamed: 0,name,city,country,school,list,lat,long
0,Avery Ao,taiyuan,China,Colege of Agricultural and Life Sciences,Dean's List,37.8903,112.5509
1,Dongrui Bai,xian,China,Colege of Letters & Science,Dean's List,34.2778,108.9531
2,Tony Bai,beijing,China,Colege of Engineering,Dean's Honor List,39.93,116.3956
3,Yuchen Bai,beijing,China,Colege of Letters & Science,Dean's List,39.93,116.3956
4,Xinya Bi,jinan,China,School of Business,Dean's List,36.6828,117.025


In [10]:
# Persist the cleaned, geocoded dataframe as CSV (row index omitted)
df.to_csv(path_or_buf="deanlist_2017_fall.csv", index=False)