<a href="https://colab.research.google.com/github/tanyarw/disaster-prediction/blob/main/Disaster_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount content

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir("drive/My Drive/DV: Disaster Prediction")

Import Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# **GLOBAL LANDSLIDES**

Read Dataset

In [4]:
landslide_df = pd.read_csv('Datasets/NASA_Global_Landslide_Catalog.csv')

Drop unwanted features

In [5]:
landslide_df = landslide_df.drop(['source_name', 'source_link','event_id', 'event_date','event_time',
                        'event_title', 'event_description', 'location_description','storm_name','photo_link',
                        'notes', 'event_import_source','event_import_id','country_code','submitted_date', 
                        'created_date', 'last_edited_date','admin_division_name','gazeteer_closest_point', 'gazeteer_distance','injury_count'], axis = 1)

Drop unknown categories

In [6]:
to_remove = landslide_df[ (landslide_df['landslide_category'] == 'unknown') ].index
landslide_df = landslide_df.drop(to_remove)
to_remove = landslide_df[(landslide_df['location_accuracy'] == 'unknown')].index
landslide_df = landslide_df.drop(to_remove)

Replace or drop unknown/NaN values

In [7]:
landslide_df = landslide_df.dropna(subset=['location_accuracy', 'landslide_category','landslide_trigger','landslide_size','landslide_setting','country_name'])

Determine feature and target vectors

In [8]:
X_features = list(landslide_df.columns)
X_features.remove('fatality_count')
y = landslide_df['fatality_count']
y = y.fillna(y.median()) # deal with na

One hot encoding of categorical data

In [9]:
encode_df = pd.get_dummies(landslide_df[X_features])
X = encode_df

Train and test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

Perform regression

In [11]:
clf = RandomForestRegressor(n_estimators=150, max_depth = None, criterion='mse')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Evaluation

In [12]:
mean_absolute_error(y_test, y_pred)

6.824497099008048

# **INDIAN RAINFALL**

Read dataset

In [13]:
rainfall_df = pd.read_csv('Datasets/rainfall_india_1901-2017.csv')

Deal with NaN values

In [14]:
rainfall_df.fillna(value = 0, inplace = True)

Split train and test sets

In [15]:
div_data = np.asarray(rainfall_df[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']])

X = None; y = None
for i in range(div_data.shape[1]-3):
    if X is None: 
        X = div_data[:, i:i+3] # Three consecutive months
        y = div_data[:, i+3] # Next (fourth) month
    else:
        X = np.concatenate((X, div_data[:, i:i+3]), axis=0) # Three consecutive months
        y = np.concatenate((y, div_data[:, i+3]), axis=0) # Next (fourth) month
        
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

Perform Regression

In [16]:
rf = RandomForestRegressor(n_estimators = 200, max_depth=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

Evaluate

In [17]:
mean_absolute_error(y_test, y_pred)

83.56942289876253

# **USA EARTHQUAKES**

In [18]:
earthquake_df = pd.read_csv('Datasets/earthquake-all-month.csv')

In [19]:
import re
earthquake_df['short place']=[re.findall(r'\w+',i)[-1] for i in earthquake_df['place']]
earthquake_df.dropna(subset=['mag'],inplace=True)

Feature vector

In [20]:
features=[i for i in earthquake_df.columns if earthquake_df[i].isna().sum()==0] # features include only place, type and source of an earthquake

for i in ['mag','place','time','id','updated','net','magType','depth']:
    features.remove(i)
    
X=earthquake_df[features]
y=earthquake_df[['mag','depth', 'depthError']] # we try to predict magnitude, depth as well as depthError  

Segregate categorical data

In [21]:
categorical = []
for i in features:
    if earthquake_df[i].dtype=="object":
        categorical.append(i)

Encode the data

In [22]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for i in categorical:
    X[i]=le.fit_transform(X[i])
for i in [i for i in y.columns if y[i].dtype=='object']:
    y[i]=le.fit_transform(y[i])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Train and test split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.10)

Random forest regressor

In [24]:
from sklearn.datasets import make_classification
clf = RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, ccp_alpha=0.0, max_samples=None)
clf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [25]:
y_pred = clf.predict(X_test)

Evaluate

In [26]:
mean_absolute_error(y_test, y_pred)

2.320238053912668