<a href="https://colab.research.google.com/github/tanyarw/disaster-prediction-backend/blob/main/Disaster_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount content

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir("drive/My Drive/DV: Disaster Prediction")

Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# **GLOBAL LANDSLIDES**

**TARGET:** PREDICT FATALITY COUNT

Read Dataset

In [4]:
landslide_df = pd.read_csv('Datasets/NASA_Global_Landslide_Catalog.csv')
landslide_df.head()

Unnamed: 0,source_name,source_link,event_id,event_date,event_time,event_title,event_description,location_description,location_accuracy,landslide_category,landslide_trigger,landslide_size,landslide_setting,fatality_count,injury_count,storm_name,photo_link,notes,event_import_source,event_import_id,country_name,country_code,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,submitted_date,created_date,last_edited_date,longitude,latitude
0,AGU,https://blogs.agu.org/landslideblog/2008/10/14...,684,08/01/2008 12:00:00 AM,,"Sigou Village, Loufan County, Shanxi Province","occurred early in morning, 11 villagers buried...","Sigou Village, Loufan County, Shanxi Province",unknown,landslide,rain,large,mine,11.0,,,,,glc,684.0,China,CN,Shaanxi,0.0,Jingyang,41.02145,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,107.45,32.5625
1,Oregonian,http://www.oregonlive.com/news/index.ssf/2009/...,956,01/02/2009 02:00:00 AM,,"Lake Oswego, Oregon",Hours of heavy rain are to blame for an overni...,"Lake Oswego, Oregon",5km,mudslide,downpour,small,unknown,0.0,,,,,glc,956.0,United States,US,Oregon,36619.0,Lake Oswego,0.60342,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,-122.663,45.42
2,CBS News,https://www.cbsnews.com/news/dozens-missing-af...,973,01/19/2007 12:00:00 AM,,"San Ramon district, 195 miles northeast of the...",(CBS/AP) At least 10 people died and as many a...,"San Ramon district, 195 miles northeast of the...",10km,landslide,downpour,large,unknown,10.0,,,,,glc,973.0,Peru,PE,Junín,14708.0,San Ramón,0.85548,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,-75.3587,-11.1295
3,Reuters,https://in.reuters.com/article/idINIndia-41450...,1067,07/31/2009 12:00:00 AM,,Dailekh district,"One person was killed in Dailekh district, pol...",Dailekh district,unknown,landslide,monsoon,medium,unknown,1.0,,,,,glc,1067.0,Nepal,NP,Mid Western,20908.0,Dailekh,0.75395,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,81.708,28.8378
4,The Freeman,http://www.philstar.com/cebu-news/621414/lands...,2603,10/16/2010 12:00:00 PM,,sitio Bakilid in barangay Lahug,Another landslide in sitio Bakilid in barangay...,sitio Bakilid in barangay Lahug,5km,landslide,tropical_cyclone,medium,unknown,0.0,,Supertyphoon Juan (Megi),,,glc,2603.0,Philippines,PH,Central Visayas,798634.0,Cebu City,2.02204,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,123.8978,10.3336


Drop unwanted features

In [5]:
landslide_df = landslide_df.drop(['source_name', 'source_link','event_id', 'event_date','event_time',
                        'event_title', 'event_description', 'location_description','storm_name','photo_link',
                        'notes', 'event_import_source','event_import_id','country_code','submitted_date', 
                        'created_date', 'last_edited_date','admin_division_name','gazeteer_closest_point', 'gazeteer_distance','injury_count'], axis = 1)

Drop unknown categories

In [6]:
to_remove = landslide_df[ (landslide_df['landslide_category'] == 'unknown') ].index
landslide_df = landslide_df.drop(to_remove)
to_remove = landslide_df[(landslide_df['location_accuracy'] == 'unknown')].index
landslide_df = landslide_df.drop(to_remove)

In [7]:
landslide_df = landslide_df.dropna(subset=['location_accuracy', 'landslide_category','landslide_trigger','landslide_size','landslide_setting','country_name'])

Determine feature and target vectors

In [8]:
X_features = list(landslide_df.columns)
X_features.remove('fatality_count')
X = landslide_df[X_features]
y = landslide_df['fatality_count']
y = y.fillna(y.median()) # deal with na

Encoding of categorical data

In [9]:
categorical = []
for i in X_features:
    if landslide_df[i].dtype=="object":
        categorical.append(i)

In [10]:
from sklearn import preprocessing
label_maps = {}
for i in categorical:
    le = preprocessing.LabelEncoder().fit(X[i])
    X[i]=le.transform(X[i])
    d = dict(zip(le.classes_, le.transform(le.classes_)))
    label_maps[i] = d
print(label_maps)

{'location_accuracy': {'100km': 0, '10km': 1, '1km': 2, '250km': 3, '25km': 4, '50km': 5, '5km': 6, 'exact': 7}, 'landslide_category': {'complex': 0, 'creep': 1, 'debris_flow': 2, 'earth_flow': 3, 'lahar': 4, 'landslide': 5, 'mudslide': 6, 'other': 7, 'riverbank_collapse': 8, 'rock_fall': 9, 'snow_avalanche': 10, 'translational_slide': 11}, 'landslide_trigger': {'construction': 0, 'continuous_rain': 1, 'dam_embankment_collapse': 2, 'downpour': 3, 'earthquake': 4, 'flooding': 5, 'freeze_thaw': 6, 'mining': 7, 'monsoon': 8, 'no_apparent_trigger': 9, 'other': 10, 'rain': 11, 'snowfall_snowmelt': 12, 'tropical_cyclone': 13, 'unknown': 14, 'volcano': 15}, 'landslide_size': {'large': 0, 'medium': 1, 'small': 2, 'unknown': 3, 'very_large': 4}, 'landslide_setting': {'above_river': 0, 'above_road': 1, 'below_road': 2, 'bluff': 3, 'burned_area': 4, 'deforested_slope': 5, 'engineered_slope': 6, 'mine': 7, 'natural_slope': 8, 'other': 9, 'retaining_wall': 10, 'unknown': 11, 'urban': 12}, 'country_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [11]:
X.head()

Unnamed: 0,location_accuracy,landslide_category,landslide_trigger,landslide_size,landslide_setting,country_name,admin_division_population,longitude,latitude
1,6,6,3,2,11,130,36619.0,-122.663,45.42
2,1,5,3,0,11,93,14708.0,-75.3587,-11.1295
4,6,5,13,1,11,94,798634.0,123.8978,10.3336
5,6,5,3,1,11,94,2404.0,124.9668,10.7004
6,6,6,3,2,11,130,2126.0,-117.2665,48.2797


Train and test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

Perform regression

In [13]:
clf = RandomForestRegressor(n_estimators=150, max_depth = None, criterion='mse')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Evaluation

In [14]:
mean_absolute_error(y_test, y_pred)

3.573051843533596

# **INDIAN RAINFALL**

**TARGET:** PREDICT NEXT MONTH RAIN

Read dataset

In [15]:
rainfall_df = pd.read_csv('Datasets/rainfall_india_1901-2017.csv')

In [16]:
rainfall_df.head()

Unnamed: 0,SUBDIVISION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,JF,MAM,JJAS,OND
0,Andaman & Nicobar Islands,1901,49.2,87.1,29.2,2.3,528.8,517.5,365.1,481.1,332.6,388.5,558.2,33.6,3373.2,136.3,560.3,1696.3,980.3
1,Andaman & Nicobar Islands,1902,0.0,159.8,12.2,0.0,446.1,537.1,228.9,753.7,666.2,197.2,359.0,160.5,3520.7,159.8,458.3,2185.9,716.7
2,Andaman & Nicobar Islands,1903,12.7,144.0,0.0,1.0,235.1,479.9,728.4,326.7,339.0,181.2,284.4,225.0,2957.4,156.7,236.1,1874.0,690.6
3,Andaman & Nicobar Islands,1904,9.4,14.7,0.0,202.4,304.5,495.1,502.0,160.1,820.4,222.2,308.7,40.1,3079.6,24.1,506.9,1977.6,571.0
4,Andaman & Nicobar Islands,1905,1.3,0.0,3.3,26.9,279.5,628.7,368.7,330.5,297.0,260.7,25.4,344.7,2566.7,1.3,309.7,1624.9,630.8


Deal with NaN values

In [17]:
rainfall_df.fillna(value = 0, inplace = True)

In [18]:
from sklearn.preprocessing import StandardScaler

rainfall_df[['JAN',	'FEB',	'MAR', 'APR',	'MAY',	'JUN',	'JUL',	'AUG',	'SEP',	'OCT',	'NOV',	'DEC',	'ANNUAL',	'JF',	'MAM',	'JJAS',	'OND']] = StandardScaler().fit_transform(rainfall_df[['JAN',	'FEB',	'MAR', 'APR',	'MAY',	'JUN',	'JUL',	'AUG',	'SEP',	'OCT',	'NOV',	'DEC',	'ANNUAL',	'JF',	'MAM',	'JJAS',	'OND']])

Split train and test sets

In [19]:
div_data = np.asarray(rainfall_df[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']])

X = None; y = None
for i in range(div_data.shape[1]-3):
    if X is None: 
        X = div_data[:, i:i+3] # Three consecutive months
        y = div_data[:, i+3] # Next (fourth) month
    else:
        X = np.concatenate((X, div_data[:, i:i+3]), axis=0) # Three consecutive months
        y = np.concatenate((y, div_data[:, i+3]), axis=0) # Next (fourth) month
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [20]:
X_test

array([[-0.22100623,  0.69202668, -0.65303125],
       [ 0.39610828,  0.1404294 , -0.26809356],
       [-0.63183432, -0.92095813, -0.5102474 ],
       ...,
       [-0.44962175,  0.18922438, -0.94644743],
       [-0.50727306, -1.09992949, -0.43227464],
       [-0.5127669 , -0.59306676, -0.57086419]])

Perform Regression

In [21]:
rf = RandomForestRegressor(n_estimators = 200, max_depth=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

Evaluate

In [22]:
mean_absolute_error(y_test, y_pred)

0.5249428060081469

# **USA EARTHQUAKES**

**TARGET:** PREDICT MAGNITUDE

In [23]:
earthquake_df = pd.read_csv('Datasets/earthquake-all-month.csv')

In [24]:
earthquake_df['type'].value_counts().index


Index(['earthquake', 'quarry blast', 'explosion', 'ice quake', 'other event'], dtype='object')

In [25]:
import re
earthquake_df['short place']=[re.findall(r'\w+',i)[-1] for i in earthquake_df['place']]
earthquake_df.dropna(subset=['mag'],inplace=True)

Feature vector

In [26]:
features=[i for i in earthquake_df.columns if earthquake_df[i].isna().sum()==0]

for i in ['mag','place','time','id','updated','net','magType']: 
    features.remove(i)
    
X=earthquake_df[features]
y=earthquake_df[['mag']]

In [27]:
X[['depth']] = StandardScaler().fit_transform(X[['depth']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


Segregate categorical data

In [28]:
categorical = []
for i in features:
    if earthquake_df[i].dtype=="object":
        categorical.append(i)

Encode the data

In [29]:
from sklearn import preprocessing
label_maps = {}
for i in categorical:
    le = preprocessing.LabelEncoder().fit(X[i])
    X[i]=le.transform(X[i])
    d = dict(zip(le.classes_, le.transform(le.classes_)))
    label_maps[i] = d
print(label_maps)

{'type': {'earthquake': 0, 'explosion': 1, 'ice quake': 2, 'other event': 3, 'quarry blast': 4}, 'status': {'automatic': 0, 'reviewed': 1}, 'locationSource': {'ak': 0, 'av': 1, 'ci': 2, 'hv': 3, 'mb': 4, 'nc': 5, 'nm': 6, 'nn': 7, 'ok': 8, 'pr': 9, 'se': 10, 'tx': 11, 'us': 12, 'uu': 13, 'uw': 14}, 'magSource': {'ak': 0, 'av': 1, 'ci': 2, 'guc': 3, 'hv': 4, 'mb': 5, 'nc': 6, 'nm': 7, 'nn': 8, 'ok': 9, 'pr': 10, 'se': 11, 'tx': 12, 'us': 13, 'uu': 14, 'uw': 15}, 'short place': {'Afghanistan': 0, 'Africa': 1, 'Alabama': 2, 'Alaska': 3, 'Albania': 4, 'Algeria': 5, 'America': 6, 'Argentina': 7, 'Arizona': 8, 'Arkansas': 9, 'Austria': 10, 'Barbados': 11, 'Bhutan': 12, 'Bolivia': 13, 'Brazil': 14, 'CA': 15, 'Caledonia': 16, 'California': 17, 'Canada': 18, 'Chile': 19, 'China': 20, 'Colombia': 21, 'Colorado': 22, 'Congo': 23, 'Dakota': 24, 'Dominica': 25, 'Ecuador': 26, 'Fiji': 27, 'Georgia': 28, 'Greece': 29, 'Guadeloupe': 30, 'Guam': 31, 'Guatemala': 32, 'Guinea': 33, 'Guyana': 34, 'Haiti':

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [30]:
label_maps.keys()

dict_keys(['type', 'status', 'locationSource', 'magSource', 'short place'])

Train and test split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10)

In [32]:
X_train.head()

Unnamed: 0,latitude,longitude,depth,rms,type,depthError,status,locationSource,magSource,short place
5158,33.5905,-116.809167,-0.267634,0.13,0,0.45,1,2,2,15
4845,36.136333,-117.820167,-0.365934,0.1,0,0.34,1,2,2,15
7124,36.46,-98.760333,-0.242549,0.03,0,0.1,1,8,9,71
5656,35.680333,-117.531,-0.188504,0.17,0,0.47,1,2,2,15
1119,17.9435,-66.8218,-0.147512,0.1,0,0.36,1,9,10,81


Random forest regressor

In [33]:
from sklearn.datasets import make_classification
clf = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
clf.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [34]:
y_pred = clf.predict(X_test)

Evaluate

In [35]:
mean_absolute_error(y_test, y_pred)

0.2939576443882924