In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas and
# scikit-learn, so API drift in the cells below goes unnoticed.
warnings.filterwarnings('ignore')
import findspark
# Called before the pyspark imports that follow; presumably it makes a local
# Spark installation importable — TODO confirm against the findspark docs.
findspark.init()

from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [2]:
# Read '300k.csv' (resolved against the working directory) into the raw frame `pkmn`.
pkmn = pd.read_csv(filepath_or_buffer='300k.csv')

In [3]:
# Show the city / latitude / longitude columns for the first ten rows.
pkmn.loc[:, ['city', 'latitude', 'longitude']].head(10)

Unnamed: 0,city,latitude,longitude
0,Mexico_City,20.525745,-97.460829
1,Mexico_City,20.523695,-97.461167
2,New_York,38.90359,-77.19978
3,Los_Angeles,47.665903,-122.312561
4,Los_Angeles,47.666454,-122.311628
5,Perth,-31.95498,115.853609
6,Perth,-31.954245,115.852038
7,Chicago,26.235257,-98.197591
8,Mexico_City,20.525554,-97.4588
9,New_York,32.928558,-84.340278


In [4]:
# Build the working frame `pk` from `pkmn` without the row id, the cellId_*
# columns, the gymIn*/pokestopIn* columns and appearedDayOfWeek.
# `columns=` replaces the old positional axis argument (`drop(labels, 1)`),
# which newer pandas releases no longer accept.
pk = pkmn.drop(
    columns=[
        '_id',
        'cellId_90m', 'cellId_180m', 'cellId_370m', 'cellId_730m',
        'cellId_1460m', 'cellId_2920m', 'cellId_5850m',
        'gymIn100m', 'gymIn250m', 'gymIn500m',
        'gymIn1000m', 'gymIn2500m', 'gymIn5000m',
        'pokestopIn100m', 'pokestopIn250m', 'pokestopIn500m',
        'pokestopIn1000m', 'pokestopIn2500m', 'pokestopIn5000m',
        'appearedDayOfWeek',
    ]
)

In [5]:
# Preview the first five rows of the trimmed frame.
pk.head(n=5)

Unnamed: 0,pokemonId,latitude,longitude,appearedLocalTime,appearedTimeOfDay,appearedHour,appearedMinute,appearedDay,appearedMonth,appearedYear,...,cooc_143,cooc_144,cooc_145,cooc_146,cooc_147,cooc_148,cooc_149,cooc_150,cooc_151,class
0,16,20.525745,-97.460829,2016-09-08T03:57:45,night,5,57,8,8,2016,...,False,False,False,False,False,False,False,False,False,16
1,133,20.523695,-97.461167,2016-09-08T03:57:37,night,5,57,8,8,2016,...,False,False,False,False,False,False,False,False,False,133
2,16,38.90359,-77.19978,2016-09-08T03:57:25,night,5,57,8,8,2016,...,False,False,False,False,False,False,False,False,False,16
3,13,47.665903,-122.312561,2016-09-08T03:56:22,night,5,56,8,8,2016,...,False,False,False,False,False,False,False,False,False,13
4,133,47.666454,-122.311628,2016-09-08T03:56:08,night,5,56,8,8,2016,...,False,False,False,False,False,False,False,False,False,133


In [6]:
# Rebuild the clock/calendar columns from the appearedLocalTime timestamp.
# In the raw preview the precomputed columns disagree with the timestamp
# (e.g. 2016-09-08T03:57:45 is listed with appearedHour=5, appearedMonth=8),
# so they are dropped and re-derived. The re-derived columns end up at the
# end of the frame.
pk['appearedLocalTime'] = pd.to_datetime(pk['appearedLocalTime'], format='%Y-%m-%dT%H:%M:%S')

# `columns=` instead of the positional axis argument, which newer pandas rejects.
pk = pk.drop(columns=['appearedHour', 'appearedMinute', 'appearedDay', 'appearedMonth', 'appearedYear'])

pk['appearedHour'] = pk['appearedLocalTime'].dt.hour
pk['appearedMinute'] = pk['appearedLocalTime'].dt.minute
pk['appearedDay'] = pk['appearedLocalTime'].dt.day
pk['appearedMonth'] = pk['appearedLocalTime'].dt.month
pk['appearedYear'] = pk['appearedLocalTime'].dt.year

# The timestamp itself is not kept once its parts are extracted.
# Note: this makes the cell single-shot — re-running it without re-running the
# cells above fails because appearedLocalTime is gone.
pk = pk.drop(columns=['appearedLocalTime'])

In [7]:
# Encode appearedTimeOfDay as an ordinal: morning=1, afternoon=2, evening=3, night=4.
# Labels that are not keys of `time` come out of Series.map as NaN.
time = dict(morning=1, afternoon=2, evening=3, night=4)
pk['appearedTimeOfDay'] = pk['appearedTimeOfDay'].map(time)

In [8]:
# Fold the finer-grained labels found in `continent` into broader ones:
# the America/* variants become America, Pacific becomes Australia,
# Atlantic becomes Europe and India becomes Asia. All other labels are untouched.
pk['continent'] = pk['continent'].replace({
    'America/Indiana': 'America',
    'America/Kentucky': 'America',
    'Pacific': 'Australia',
    'Atlantic': 'Europe',
    'America/Argentina': 'America',
    'India': 'Asia',
})

In [9]:
# Integer codes for the four continent labels kept after normalisation.
# Any other label becomes NaN under Series.map.
# NOTE(review): the preview that follows shows the column as floats (1.0),
# which suggests some rows were not covered by this mapping — confirm.
cont = dict(America=1, Australia=2, Europe=3, Asia=4)
pk['continent'] = pk['continent'].map(cont)

In [10]:
# Display the first 200 rows of the frame after the encodings applied so far.
pk.iloc[:200]

Unnamed: 0,pokemonId,latitude,longitude,appearedTimeOfDay,terrainType,closeToWater,city,continent,weather,temperature,...,cooc_148,cooc_149,cooc_150,cooc_151,class,appearedHour,appearedMinute,appearedDay,appearedMonth,appearedYear
0,16,20.525745,-97.460829,4,14,False,Mexico_City,1.0,Foggy,25.5,...,False,False,False,False,16,3,57,8,9,2016
1,133,20.523695,-97.461167,4,14,False,Mexico_City,1.0,Foggy,25.5,...,False,False,False,False,133,3,57,8,9,2016
2,16,38.903590,-77.199780,4,13,False,New_York,1.0,Clear,24.2,...,False,False,False,False,16,3,57,8,9,2016
3,13,47.665903,-122.312561,4,0,True,Los_Angeles,1.0,PartlyCloudy,15.6,...,False,False,False,False,13,3,56,8,9,2016
4,133,47.666454,-122.311628,4,0,True,Los_Angeles,1.0,PartlyCloudy,15.6,...,False,False,False,False,133,3,56,8,9,2016
5,21,-31.954980,115.853609,4,13,False,Perth,2.0,PartlyCloudy,16.5,...,False,False,False,False,21,3,55,8,9,2016
6,66,-31.954245,115.852038,4,13,False,Perth,2.0,PartlyCloudy,16.5,...,False,False,False,False,66,3,55,8,9,2016
7,27,26.235257,-98.197591,4,13,False,Chicago,1.0,Clear,28.0,...,False,False,False,False,27,3,55,8,9,2016
8,35,20.525554,-97.458800,4,14,False,Mexico_City,1.0,Foggy,25.5,...,False,False,False,False,35,3,55,8,9,2016
9,19,32.928558,-84.340278,4,8,False,New_York,1.0,Clear,23.7,...,False,False,False,False,19,3,54,8,9,2016


In [11]:
# Peek at the raw windBearing values before they are bucketed.
pk.loc[:, 'windBearing'].head()

0    269
1    269
2    218
3    160
4    160
Name: windBearing, dtype: int64

In [12]:
# Bucket windBearing into sector codes 0-7.
#   code 0      : values below 24 or at/above 400
#   codes 1..7  : the half-open ranges [24,70), [70,120), [120,170), [170,220),
#                 [220,270), [270,320), [320,400)
# Masks are computed on a snapshot of the column so that freshly written codes
# are never compared against the bin edges.
bearing_raw = pk['windBearing'].copy()
sector_edges = [24, 70, 120, 170, 220, 270, 320, 400]

pk.loc[(bearing_raw >= sector_edges[-1]) | (bearing_raw < sector_edges[0]), 'windBearing'] = 0
for sector, (low, high) in enumerate(zip(sector_edges[:-1], sector_edges[1:]), start=1):
    pk.loc[(bearing_raw >= low) & (bearing_raw < high), 'windBearing'] = sector

In [13]:
# Integer code for every weather label, numbered 1..26 in the order listed.
weather = {
    label: code
    for code, label in enumerate(
        [
            'Foggy', 'Clear', 'PartlyCloudy', 'MostlyCloudy',
            "Overcast", "Rain", "BreezyandOvercast", "LightRain",
            "Drizzle", "BreezyandPartlyCloudy", "HeavyRain", "BreezyandMostlyCloudy",
            "Breezy", "Windy", "WindyandFoggy", "Humid",
            "Dry", "WindyandPartlyCloudy", "DangerouslyWindy", "DryandMostlyCloudy",
            "DryandPartlyCloudy", "DrizzleandBreezy", "LightRainandBreezy", "HumidandPartlyCloudy",
            "HumidandOvercast", "RainandWindy",
        ],
        start=1,
    )
}

In [14]:
# Swap each weather label for its integer code from `weather`, then preview.
pk = pk.assign(weather=pk['weather'].map(weather))
pk.head(n=5)

Unnamed: 0,pokemonId,latitude,longitude,appearedTimeOfDay,terrainType,closeToWater,city,continent,weather,temperature,...,cooc_148,cooc_149,cooc_150,cooc_151,class,appearedHour,appearedMinute,appearedDay,appearedMonth,appearedYear
0,16,20.525745,-97.460829,4,14,False,Mexico_City,1.0,1,25.5,...,False,False,False,False,16,3,57,8,9,2016
1,133,20.523695,-97.461167,4,14,False,Mexico_City,1.0,1,25.5,...,False,False,False,False,133,3,57,8,9,2016
2,16,38.90359,-77.19978,4,13,False,New_York,1.0,2,24.2,...,False,False,False,False,16,3,57,8,9,2016
3,13,47.665903,-122.312561,4,0,True,Los_Angeles,1.0,3,15.6,...,False,False,False,False,13,3,56,8,9,2016
4,133,47.666454,-122.311628,4,0,True,Los_Angeles,1.0,3,15.6,...,False,False,False,False,133,3,56,8,9,2016


In [15]:
# Replace population_density with a coarse level:
#   0: below 200, 1: [200, 800), 2: [800, 1400), 3: 1400 and above.
# Fix: the top level is now inclusive of 1400. Previously the rules were
# `< 1400` and `> 1400`, so a value of exactly 1400 matched nothing and kept
# its raw magnitude.
# Masks use a snapshot of the raw column so written codes are never re-compared.
density_raw = pk['population_density'].copy()
pk.loc[density_raw < 200, 'population_density'] = 0
pk.loc[(density_raw >= 200) & (density_raw < 800), 'population_density'] = 1
pk.loc[(density_raw >= 800) & (density_raw < 1400), 'population_density'] = 2
pk.loc[density_raw >= 1400, 'population_density'] = 3

# Drop the urban / suburban / midurban / rural columns; `columns=` replaces the
# positional axis argument that newer pandas rejects.
pk = pk.drop(columns=['urban', 'suburban', 'midurban', 'rural'])

In [16]:
# Check the bucketed population_density values.
pk.loc[:, 'population_density'].head()

0    3.0
1    3.0
2    1.0
3    3.0
4    3.0
Name: population_density, dtype: float64

In [17]:
# Replace temperature with a level:
#   0: below 10, 1: [10, 20), 2: [20, 30), 3: [30, 40), 4: 40 and above.
# Fix: the top level is now inclusive of 40. Previously the rules were `< 40`
# and `> 40`, so a reading of exactly 40 matched nothing and stayed as the raw
# value 40.
# Masks use a snapshot of the raw column so written codes are never re-compared.
temperature_raw = pk['temperature'].copy()
pk.loc[temperature_raw < 10, 'temperature'] = 0
pk.loc[(temperature_raw >= 10) & (temperature_raw < 20), 'temperature'] = 1
pk.loc[(temperature_raw >= 20) & (temperature_raw < 30), 'temperature'] = 2
pk.loc[(temperature_raw >= 30) & (temperature_raw < 40), 'temperature'] = 3
pk.loc[temperature_raw >= 40, 'temperature'] = 4

In [18]:
# Check the bucketed temperature values.
pk.loc[:, 'temperature'].head()

0    2.0
1    2.0
2    2.0
3    1.0
4    1.0
Name: temperature, dtype: float64

In [19]:
# Replace windSpeed with a level 0-8:
#   0: below 1, then one level per half-open range
#   [1,5), [5,10), [10,15), [15,20), [20,25), [25,30), [30,35),
#   and 8 for 35 and above.
# Fix: the top level is now inclusive of 35. Previously the rules were `< 35`
# and `> 35`, so a value of exactly 35 matched nothing and kept its raw magnitude.
# Masks use a snapshot of the raw column so written codes are never re-compared.
speed_raw = pk['windSpeed'].copy()
speed_edges = [1, 5, 10, 15, 20, 25, 30, 35]

pk.loc[speed_raw < speed_edges[0], 'windSpeed'] = 0
for level, (low, high) in enumerate(zip(speed_edges[:-1], speed_edges[1:]), start=1):
    pk.loc[(speed_raw >= low) & (speed_raw < high), 'windSpeed'] = level
pk.loc[speed_raw >= speed_edges[-1], 'windSpeed'] = len(speed_edges)  # level 8

In [20]:
# Check the bucketed windSpeed values.
pk.loc[:, 'windSpeed'].head()

0    1.0
1    1.0
2    1.0
3    2.0
4    2.0
Name: windSpeed, dtype: float64

In [21]:
# Replace pressure with a level 0-7:
#   0: below 1000, then one level per 5-unit half-open range
#   [1000,1005), [1005,1010), [1010,1015), [1015,1020), [1020,1025), [1025,1030),
#   and 7 for 1030 and above.
# Fix: the top level is now inclusive of 1030. Previously the rules were
# `< 1030` and `> 1030`, so a value of exactly 1030 matched nothing and kept
# its raw magnitude.
# Masks use a snapshot of the raw column so written codes are never re-compared.
pressure_raw = pk['pressure'].copy()
pressure_edges = [1000, 1005, 1010, 1015, 1020, 1025, 1030]

pk.loc[pressure_raw < pressure_edges[0], 'pressure'] = 0
for level, (low, high) in enumerate(zip(pressure_edges[:-1], pressure_edges[1:]), start=1):
    pk.loc[(pressure_raw >= low) & (pressure_raw < high), 'pressure'] = level
pk.loc[pressure_raw >= pressure_edges[-1], 'pressure'] = len(pressure_edges)  # level 7

In [22]:
# Turn closeToWater into integer category codes in a single step.
pk['closeToWater'] = pk['closeToWater'].astype('category').cat.codes

In [23]:
# Columns carried into the modelling frame, grouped by kind.
# A later cell uses latitude/longitude as labels and drops city from the features.
select = (
    ['class', 'latitude', 'longitude', 'city']
    + ['appearedHour', 'appearedMinute', 'appearedTimeOfDay']
    + ['terrainType', 'pressure', 'windBearing', 'weather']
    + ['population_density', 'temperature', 'windSpeed', 'closeToWater']
)

In [24]:
# Keep only the columns named in `select`, in that order.
data = pk[select]

In [25]:
# train_test_split lives in sklearn.model_selection; the sklearn.cross_validation
# module was removed in scikit-learn 0.20. The fallback keeps very old
# installations working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [26]:
# Model inputs: everything in `data` except the coordinates and the city name.
# `columns=` replaces the positional axis argument that newer pandas rejects.
features = data.drop(columns=['latitude', 'longitude', 'city'])

# Regression targets: a two-column frame with latitude first, longitude second
# (later cells select them by position).
labels = data[['latitude', 'longitude']]

In [27]:
# 70 % of the rows go to training; the fixed seed keeps the split repeatable.
(train_feature, test_feature, train_label, test_label) = train_test_split(
    features,
    labels,
    train_size=0.7,
    random_state=46,
)

In [28]:
# Show the held-out latitude targets as a one-column frame.
test_label[['latitude']]

Unnamed: 0,latitude
208428,49.962754
161584,45.514116
243431,49.847894
137138,49.644354
43466,32.905209
80368,53.727019
153701,44.576401
271832,49.483441
109529,48.887934
201904,43.723587


In [29]:
# Count missing values per training feature.
train_feature.isna().sum()

class                 0
appearedHour          0
appearedMinute        0
appearedTimeOfDay     0
terrainType           0
pressure              0
windBearing           0
weather               0
population_density    0
temperature           0
windSpeed             0
closeToWater          0
dtype: int64

In [30]:
from sklearn import ensemble

In [31]:
# Fit a gradient-boosted regressor that predicts latitude (first label column)
# from the training features, score it on the held-out rows and keep the
# held-out predictions in `result`.
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    max_depth=10,
    n_estimators=100,
    learning_rate=0.7,
    verbose=1,
    random_state=46,  # same seed as the train/test split, so reruns repeat
)
model = model_GradientBoostingRegressor

# Pass the target as a 1-D Series (`iloc[:, 0]`) rather than a one-column frame
# (`iloc[:, [0]]`): the regressor expects a 1-D y and otherwise has to convert
# a column vector, which raises a warning hidden by the global warnings filter.
model.fit(train_feature, train_label.iloc[:, 0])

# Regressor.score reports R^2 on the held-out data.
score = model.score(test_feature, test_label.iloc[:, 0])
result = model.predict(test_feature)
print(score)

      Iter       Train Loss   Remaining Time 
         1         111.2774            7.13m
         2          75.6403            6.85m
         3          53.9645            7.20m
         4          49.4695            7.02m
         5          41.9584            7.03m
         6          32.1886            7.19m
         7          30.7217            6.81m
         8          29.1536            6.78m
         9          26.3956            6.81m
        10          25.0641            6.78m
        20          14.1599            5.94m
        30           9.3750            5.15m
        40           7.4270            4.29m
        50           6.0016            3.60m
        60           4.8693            2.87m
        70           4.0691            2.15m
        80           3.4321            1.44m
        90           3.0412           43.01s
       100           2.6685            0.00s
0.9428171170714121


In [32]:
# Fit a second gradient-boosted regressor, this time for longitude (second
# label column). Note that this rebinds `model`, `score` and
# `model_GradientBoostingRegressor`; the predictions go to `result_longitude`.
model_GradientBoostingRegressor = ensemble.GradientBoostingRegressor(
    max_depth=10,
    n_estimators=100,
    learning_rate=0.7,
    verbose=1,
    random_state=46,  # same seed as the train/test split, so reruns repeat
)
model = model_GradientBoostingRegressor

# Pass the target as a 1-D Series (`iloc[:, 1]`) rather than a one-column frame
# (`iloc[:, [1]]`): the regressor expects a 1-D y and otherwise has to convert
# a column vector, which raises a warning hidden by the global warnings filter.
model.fit(train_feature, train_label.iloc[:, 1])

# Regressor.score reports R^2 on the held-out data.
score = model.score(test_feature, test_label.iloc[:, 1])
result_longitude = model.predict(test_feature)
print(score)

      Iter       Train Loss   Remaining Time 
         1        1696.9952            7.34m
         2        1082.1340            7.44m
         3         806.2381            7.46m
         4         693.3527            7.32m
         5         612.3431            7.02m
         6         508.8254            7.00m
         7         471.8031            6.73m
         8         417.3699            6.65m
         9         402.1630            6.63m
        10         372.6903            6.45m
        20         214.5787            5.45m
        30         131.0001            4.94m
        40         101.7457            4.30m
        50          83.2694            3.61m
        60          71.5559            2.86m
        70          59.4545            2.16m
        80          51.8432            1.44m
        90          43.2903           43.53s
       100          37.5246            0.00s
0.9582321964756763


In [35]:
# Wrap both prediction arrays in DataFrames and write each to its own CSV.
# NOTE(review): the frames get a fresh 0..n-1 index, so the files do not carry
# the original row ids of the held-out set — confirm that is intended.
latitude_result, longitude_result = pd.DataFrame(result), pd.DataFrame(result_longitude)
for frame, target in (
    (latitude_result, '../final project/latitude_result.csv'),
    (longitude_result, '../final project/longitude_result.csv'),
):
    frame.to_csv(target)