In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
#from pyspark import SparkContext, SparkConf
#sc =SparkContext()
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [12]:
earthquake_data = pd.read_csv("earthquake_data.csv")
print(earthquake_data.columns)
earthquake_data.head(3)

Index(['ISO', 'Country', 'latitude', 'longitude', 'depth', 'mag', 'year',
       'month', 'day', 'Total affected', 'Total deaths',
       'Total damage ('000 US$)', '2017'],
      dtype='object')


Unnamed: 0,ISO,Country,latitude,longitude,depth,mag,year,month,day,Total affected,Total deaths,Total damage ('000 US$),2017
0,AFG,Afghanistan,35.169,69.389,62.1,5.0,2001.0,6.0,1.0,270,4,0,0.498
1,AFG,Afghanistan,36.429,70.438,209.0,6.3,2002.0,3.0,3.0,3513,150,0,0.498
2,AFG,Afghanistan,33.426,69.524,10.0,5.2,2004.0,7.0,18.0,1040,2,0,0.498


In [13]:
population_data = pd.read_csv("population_data.csv")
population_density_data = population_data.drop(['Unnamed: 0','UN_2000_DS','UN_2005_DS',
                                                'UN_2010_DS','UN_2015_DS','UN_2020_DS'],axis=1)
print(population_density_data.shape)
population_density_data['index']=population_density_data.index
population_density_data.head(3)

(2378877, 9)


Unnamed: 0,ISOALPHA,NAME1,CENTROID_X,CENTROID_Y,UN_2000_E,UN_2005_E,UN_2010_E,UN_2015_E,UN_2020_E,index
0,ITA,Marche,13.452,43.288,93,95,95,95,94,0
1,CAN,Quebec / Quebec,-71.288,46.971,23,29,36,44,53,1
2,ITA,Veneto,12.387,44.922,99,97,93,89,84,2


In [4]:
population_density_data.columns

Index(['ISOALPHA', 'NAME1', 'CENTROID_X', 'CENTROID_Y', 'UN_2000_E',
       'UN_2005_E', 'UN_2010_E', 'UN_2015_E', 'UN_2020_E', 'index'],
      dtype='object')

In [14]:
def myround(x, base=5):
    return int(base * round(float(x)/base))

def get_coords_diff(lat_fix, long_fix, lat, long):
    x = (lat-lat_fix)**2 + (long-long_fix)**2
    return x

def get_pop(iso, lat, long, year, lat_range, long_range, pop_data):
    population_data = pop_data[pop_data['ISOALPHA']==iso]
    lat_range = (population_data['CENTROID_Y']>(lat-lat_range)) & (population_data['CENTROID_Y']<=(lat+lat_range))
    long_range = (population_data['CENTROID_X']>(long-long_range)) & (population_data['CENTROID_X']<=(long+long_range))
    population_data = population_data[lat_range&long_range]
    rounded_year = myround(year)
    year_col = "UN_{}_E".format(rounded_year)
    pop_count = population_data[year_col].values.sum()
    return pop_count

def get_pop2(iso, lat, long, year, pop_data):
    population_data = pop_data[pop_data['ISOALPHA']==iso]
    range_val = 0.5
    lat_range = (population_data['CENTROID_Y']>(lat-range_val)) & (population_data['CENTROID_Y']<=(lat+range_val))
    long_range = (population_data['CENTROID_X']>(long-range_val)) & (population_data['CENTROID_X']<=(long+range_val))
    population_data = population_data[lat_range&long_range]
    diff_list = {row['index']:get_coords_diff(
        lat,long,row['CENTROID_Y'],row['CENTROID_X']) for index, row in population_data.iterrows()}
    if not(bool(diff_list)):
        pop_ds = np.nan
    else:
        min_index = min(diff_list, key=diff_list.get)
        rounded_year = myround(year)
        year_col = "UN_{}_E".format(rounded_year)
        matched_row = population_data[(population_data['index']==min_index)]
        pop_ds = matched_row[year_col].item()
    return pop_ds


In [15]:
earthquake_data['pop_count'] = earthquake_data.apply(lambda x: get_pop(
    x['ISO'],x['latitude'],x['longitude'],x['year'],0.5,0.5,population_density_data), axis=1)

In [17]:
unmerged_data = earthquake_data[earthquake_data['pop_count']==0]
print(unmerged_data.shape)
unmerged_data['pop_count'] = unmerged_data.apply(lambda x: get_pop(
    x['ISO'],x['latitude'],x['longitude'],x['year'],1,1,population_density_data), axis=1)

(71, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [18]:
unmerged_data[unmerged_data['pop_count']==0].shape

(39, 14)

In [20]:
unmerged_data.to_csv("unmerged_pop_data.csv",index=False)

In [19]:
earthquake_data

Unnamed: 0,ISO,Country,latitude,longitude,depth,mag,year,month,day,Total affected,Total deaths,Total damage ('000 US$),2017,pop_count
0,AFG,Afghanistan,35.169,69.389,62.100,5.000,2001.000,6.000,1.000,270,4,0,0.498,1089272
1,AFG,Afghanistan,36.429,70.438,209.000,6.300,2002.000,3.000,3.000,3513,150,0,0.498,228021
2,AFG,Afghanistan,33.426,69.524,10.000,5.200,2004.000,7.000,18.000,1040,2,0,0.498,942340
3,AFG,Afghanistan,34.539,73.588,26.000,7.600,2005.000,10.000,8.000,0,1,50,0.498,0
4,AFG,Afghanistan,37.255,68.828,34.200,5.600,2006.000,7.000,29.000,935,1,0,0.498,749168
5,AFG,Afghanistan,35.633,67.658,13.000,5.600,2010.000,4.000,18.000,1070,11,0,0.498,276305
6,DZA,Algeria,36.964,3.634,12.000,6.800,2003.000,5.000,21.000,210261,2266,5000000,0.754,2527079
7,DZA,Algeria,36.939,3.578,8.000,5.800,2003.000,5.000,27.000,200,9,0,0.754,3150275
8,DZA,Algeria,36.852,3.418,10.000,4.500,2004.000,1.000,10.000,300,0,0,0.754,4412169
9,DZA,Algeria,36.848,3.448,10.000,4.500,2004.000,12.000,1.000,15,0,0,0.754,4091222
