In [6]:
import pandas as pd
import time
from datetime import datetime, timedelta
import json
import requests
import re
import matplotlib as plt
from bs4 import BeautifulSoup

In [7]:
### convert DMS coordinates to lat-long coordinates
def dms2dec(dms_str):
    dms_str = re.sub(r'\s', '', dms_str) 
    sign = -1 if re.search('[swSW]', dms_str) else 1  
    numbers = [*filter(len, re.split('\D+', dms_str, maxsplit=4))]
    degree = numbers[0]
    minute = numbers[1] if len(numbers) >= 2 else '0'
    second = numbers[2] if len(numbers) >= 3 else '0'
    frac_seconds = numbers[3] if len(numbers) >= 4 else '0'
    second += "." + frac_seconds
    return sign * (int(degree) + float(minute) / 60 + float(second) / 3600)

In [8]:
### function - crawling (GFZ) 
def get_rows_by_id(id):
    tables_gfz = requests.get(f'https://geofon.gfz-potsdam.de/eqinfo/list.php?page={id}')
    bs = BeautifulSoup(tables_gfz.content, 'html.parser')
    if 'No events matching' in tables_gfz.content.decode('utf-8'):
        return None
    magns, cords, dates, times = [], [], [], []
    for t in bs.find_all('span', attrs={'class':'magbox'}):
        magns.append(t.string.replace(' ', '').replace('\n', ''))
    for t in bs.find_all('div', attrs={'class':'col-xs-9 col-sm-9 thin-gutter'}):
        o = t.find('div', attrs={'class':'col-xs-12'})
        if o.has_attr('title'):
            cords.append(o['title'])
    for t in bs.find_all('div', attrs={'class':'col-xs-9 col-sm-9 thin-gutter'}):
        o = t.find_all('div', attrs={'class':'col-xs-12'})
        if 'Time' not in o[1].text:
            dates.append(o[1].text.split()[0])
            times.append(o[1].text.split()[1])
    df_gfz = pd.DataFrame({'Date': dates, 'Time': times, 'Cords': cords, 'Magnitude': magns})
    return df_gfz

In [9]:
### function - pages number (GFZ)
def get_number_of_pages():
    pgs = 2037 # Min
    tables_gfz = requests.get(f'https://geofon.gfz-potsdam.de/eqinfo/list.php?page={pgs}')
    while 'No events matching' not in tables_gfz.content.decode('utf-8'):
        tables_gfz = requests.get(f'https://geofon.gfz-potsdam.de/eqinfo/list.php?page={pgs}')
        pgs += 1
    return pgs

In [10]:
### function - get API from earthquake.usgs
def get_data(start_year, start_month, start_day, end_year, end_month, end_day):
    url = 'https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime='
    time = start_year+'-'+start_month+'-'+start_day+'&endtime='+end_year+'-'+end_month+'-'+end_day
    response = requests.get(url+time)
    response_j = response.content.decode('utf-8')
    response_d = json.loads(response_j)
    return response_d

In [11]:
### Crawling - Wikipedia
tables_wiki = pd.read_html("https://en.wikipedia.org/wiki/Lists_of_20th-century_earthquakes", match="Deaths")
df_wiki = pd.DataFrame()
for table in tables_wiki:
    df_wiki = pd.concat([df_wiki, table], ignore_index=True)
df_wiki = df_wiki[["Date", "Time", "Lat.", "Long.", "Magnitude", "Deaths"]]

### fix the data
df_wiki = df_wiki.rename(columns = {'Lat.':'Lat','Long.': 'Long' })

### fix date format -> YYYY-MM-DD
for i in range(len(df_wiki['Date'])):
    try:
        df_wiki['Date'][i] = datetime.strptime(df_wiki['Date'][i], '%B %d, %Y').strftime('%Y-%m-%d')
    except: 
        df_wiki = df_wiki.drop(i)  
        
### fix time format -> HH:MM
for i in range(len(df_wiki['Time'])):
    try:
        df_wiki['Time'][i] = df_wiki['Time'][i].split(' ')[0] 
    except: 
        pass
df_wiki = df_wiki.drop(300,axis=0)
df_wiki = df_wiki.reset_index(drop = True)

In [12]:
df_wiki

Unnamed: 0,Date,Time,Lat,Long,Magnitude,Deaths
0,1901-03-03,07:45,36.2,−120.7,6.4,
1,1901-03-31,07:10,,,7.2,4
2,1901-11-16,07:47,-42.42,173.18,6.9,
3,1902-09-19,,,,6.0,2
4,1902-04-19,02:23,14,−91,7.5,2000
...,...,...,...,...,...,...
363,2000-05-04,04:21,-1.105,123.573,7.6,46
364,2000-06-04,23:30,4.72,102.08,7.9,103
365,2000-07-07,13:30,11.884,-85.988,5.4,7
366,2000-11-16,04:54,−4,152.33,8,2


In [14]:
### import the pre downloaded file
df_gfz = pd.read_csv('Crawling_Eqs.csv')

### fix the data
df_gfz['Lat'] = [dms2dec(i.split(', ')[1]) for i in df_gfz['Cords']]
df_gfz['Long'] = [dms2dec(i.split(', ')[0]) for i in df_gfz['Cords']]
temp = df_gfz['Magnitude']
df_gfz = df_gfz.drop(['Cords','Magnitude'],axis =1)
df_gfz['Magnitude'] = temp

In [15]:
df_gfz

Unnamed: 0,Date,Time,Lat,Long,Magnitude
0,2020-12-31,11:52:07,-23.166667,-179.283333,4.3
1,2020-12-31,10:49:04,0.483333,98.283333,5.0
2,2020-12-31,10:18:54,-9.366667,122.416667,5.4
3,2020-12-31,08:15:46,45.800000,16.350000,3.6
4,2020-12-31,05:23:07,45.816667,16.283333,3.3
...,...,...,...,...,...
81468,2003-10-10,03:29:23,40.000000,97.833333,5.3
81469,2003-10-09,22:19:13,14.500000,120.500000,6.0
81470,2003-10-09,22:13:23,58.333333,-32.000000,5.4
81471,2003-10-09,16:06:07,50.333333,88.166667,5.1


In [13]:
### Crawling - GFZ
### Since the crawling process take about 30 min'
### we will use a pre downloaded source (to_csv)

i = 2
df = df_gfz = get_rows_by_id(1)
number_of_pages = get_number_of_pages()
while df_gfz is not None and i < 1000:
    df_gfz = get_rows_by_id(i)
    df = pd.concat([df, df_gfz], ignore_index=True)
    i += 1
    if not i % 10:
        print('{} Pages were collected ({:.2f}%)'.format(i, i / number_of_pages * 100))
print(df)

10 Pages were collected (0.49%)
20 Pages were collected (0.98%)


In [11]:
### API in earthquake.usgs.gov
### Since the crawling process take about 30 min'
### we will use a pre downloaded source (to_csv)
respond = []
specific_date = datetime(2003, 10, 9)
for i in range(1,6200):
    s_d = specific_date.day
    s_m = specific_date.month
    s_y = specific_date.year
    specific_date = specific_date + timedelta(1)
    e_d = specific_date.day
    e_m = specific_date.month
    e_y = specific_date.year
    respond.append(get_data(str(s_y), str(s_m), str(s_d), str(e_y), str(e_m), str(e_d)))
    #if not i % 10:
    print('{} Records were collected ({:.2f}%)'.format(i, i / 6200 * 100))

1 Records were collected (0.02%)
2 Records were collected (0.03%)


In [16]:
### function - extract data from json file
### we will skip this func too because its also take 20 min'
mag, lat, long, time, tsunami = [],[],[],[],[]
for i in respond:
     for j in range(len(i['features'])):
        mag.append(i['features'][j]['properties']['mag'])
        tsunami.append(i['features'][j]['properties']['tsunami'])
        time.append(i['features'][j]['properties']['time'])
        lat.append(i['features'][j]['geometry']['coordinates'][1])
        long.append(i['features'][j]['geometry']['coordinates'][0])
df_api = pd.DataFrame({'Time': time,'Lat': lat, 'Long':long, 'Mag':mag, 'Tsunami':tsunami })

In [17]:
df_api1 = pd.read_csv('API_Eqs_part1.csv')
df_api2 = pd.read_csv('API_Eqs_part2.csv')
df_api3 = pd.read_csv('API_Eqs_part3.csv')
df_api = pd.concat([df_api1,df_api2, df_api3], ignore_index=True)

### fix the data
df_api = df_api.rename(columns = {'Mag':'Magnitude'})
df_api=df_api.rename(columns = {'Time':'temp_Time'})
df_api['Date'] = [datetime.fromtimestamp(int(i)/1000).strftime('%Y-%m-%d %H:%M:%S').split(' ')[0] for i in df_api['temp_Time']]
df_api['Time'] = [datetime.fromtimestamp(int(i)/1000).strftime('%Y-%m-%d %H:%M:%S').split(' ')[1] for i in df_api['temp_Time']]

### reorder the columns
temp_Lat = df_api['Lat']
temp_Long = df_api['Long']
temp_Magnitude = df_api['Magnitude']
temp_Tsunami = df_api['Tsunami']
df_api = df_api.drop(['temp_Time', 'Lat', 'Long','Magnitude', 'Tsunami'],axis =1)
df_api['Lat'] = temp_Lat
df_api['Long'] = temp_Long
df_api['Magnitude'] = temp_Magnitude
df_api['Tsunami'] = temp_Tsunami

In [18]:
df_api

Unnamed: 0,Date,Time,Lat,Long,Magnitude,Tsunami
0,2003-11-28,08:13:20,13.202000,119.812000,0.00,0
1,2003-11-28,08:13:20,39.340000,-119.918200,0.70,0
2,2003-11-28,08:13:20,36.635667,-120.950000,1.23,0
3,2003-11-28,08:13:20,6.775000,126.381000,0.00,0
4,2003-11-28,08:13:20,38.590000,17.255000,2.90,0
...,...,...,...,...,...,...
2246688,2021-01-07,08:13:20,33.756333,-115.914000,1.58,0
2246689,2021-01-07,08:13:20,35.979515,-97.354088,1.52,0
2246690,2021-01-07,08:13:20,59.911800,-151.122100,1.60,0
2246691,2021-01-07,08:13:20,33.750333,-115.920833,1.50,0


In [20]:
### merge all DF's to one
merge_df = pd.concat([df_wiki,df_gfz,df_api], ignore_index=True)
merge_df.to_csv('merge.csv', index = False, chunksize=1000000)