# **The steps for the proposed solution**
1. Read the train and test data as a csv file
2. Check for null values
3. Adding new features 
4. Data visualisation
5. Features selection
6. Scale data
7. Encode data
8. Creating and Training the models
9. Make the predictions
10. Make the output file


# **1. Read the train and test data as a csv file**
Load the data for the competition.

In [0]:
from zipfile import ZipFile
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [0]:
# Get the data from Google Drive: mount the Drive into the Colab runtime at
# /content/drive so files stored there can be opened through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Container folder with the project files (under the Drive mount point)
path = '/content/drive/My Drive/UmojaHack Datasets'

# Path to the password-protected challenge archive
data_path = path + '/UmojaHack#3:Hotspots.zip'
# NOTE(review): the archive password is hard-coded in the notebook; consider
# reading it from an environment variable or a prompt before sharing this file.
password = 'e78sy8'     # password to unlock the data


# The handle is named `archive`, not `zip`: a module-level `with ... as zip`
# would rebind the builtin zip() to a (closed) ZipFile for every later cell.
with ZipFile(data_path, 'r') as archive:
    # List the archive contents, then unpack everything into the current
    # working directory; the zipfile API expects the password as bytes.
    archive.printdir()
    print("Extracting all files...")
    archive.extractall(pwd=password.encode('utf-8'))
    print("Done extraction...")

In [0]:
# import the needed libraries
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from matplotlib import pyplot as plt
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model

In [0]:
# Load the training set from the extracted archive folder, parsing the
# 'date' column into datetimes, and preview the first rows.
train = pd.read_csv(
    'UmojaHack#3:Hotspots/train.csv',
    parse_dates=['date'],
)
train.head()

In [0]:
# Load the test set the same way as the training set ('date' parsed into
# datetimes) and preview the first rows.
test = pd.read_csv(
    'UmojaHack#3:Hotspots/test.csv',
    parse_dates=['date'],
)
test.head()

# **2. Check for null values**


In [0]:
# For every training column, report the number of missing values and the
# number of distinct values.
for col in train.columns:
    column = train[col]
    # nunique(dropna=False) counts NaN as ONE distinct value. Building a Python
    # set instead would count every NaN separately (NaN != NaN), inflating the
    # total for columns with missing data, and iterates row by row.
    print(col, "total null = ", column.isnull().sum(), "total = ", column.nunique(dropna=False))
print(train.columns)
# Used by later cells as the row stride when grouping rows per area.
# NOTE(review): presumably the number of distinct areas in the dataset --
# hard-coded here; confirm against the data.
total_areas = 3821

# **3. Adding new features**


In [0]:
# Date variables: add the calendar month of the parsed 'date' column as a
# new 'month' feature on both the training and the test frame.
for frame in (train, test):
    frame['month'] = frame['date'].dt.month

# **4. Data visualisation**



In [0]:
# Plotting mean burn_area for each month - very strong mid-year peak (dry season)
# Only 'burn_area' is selected before aggregating: averaging every column is
# wasted work for a one-column plot, and on pandas >= 2.0 GroupBy.mean() raises
# if the frame holds any non-numeric column.
monthly_mean = train.groupby('month')['burn_area'].mean().reset_index()
monthly_mean.plot(y='burn_area', x='month', kind='bar')

# **5. Features selection**


In [0]:
# Define input and output columns.
# Inputs are listed in two groups; the final order of in_cols is the order of
# the feature columns handed to the models.
climate_cols = [
    'climate_aet',
    'climate_def',
    'climate_pet',
    'climate_pr',
    'climate_srad',
    'climate_tmmn',
    'climate_tmmx',
    'climate_vap',
    'climate_vpd',
    'climate_vs',
]
remaining_cols = [
    'elevation',
    'landcover_2',
    'landcover_4',
    'precipitation',
    'month',
]
in_cols = climate_cols + remaining_cols

# Column the models are trained to predict
target_col = 'burn_area'
in_cols

In [0]:
from sklearn.model_selection import train_test_split

# Model inputs and target from the training set, plus the matching inputs of
# the test (submission) set.
features = train[in_cols]
labels = train[target_col]
sub_features = test[in_cols]

# Number of training rows = position where the submission rows start once
# both sets are stacked.
start_sub = features.shape[0]

# Stack train and test inputs into one frame. Each part keeps its own row
# labels (no ignore_index), which the later per-area cells rely on when they
# look rows up with .loc after splitting the frame again.
features = pd.concat((features, sub_features), axis=0, sort=False)



# **6. Scale data**


In [0]:
#scale data
def scale(df, cols):
    """Min-max scale the given columns of ``df`` to the range [0, 1].

    Each listed column is rewritten as ``(x - min) / (max - min)`` using that
    column's own minimum and maximum (missing values are ignored when taking
    min/max and stay missing in the result).

    A constant column has ``max - min == 0``; dividing by that would turn the
    whole column into NaN, so such a column is mapped to 0.0 instead.

    Args:
        df: DataFrame to scale. It is modified in place.
        cols: Iterable of column names to scale.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in cols:
        lowest = df[col].min()
        span = df[col].max() - lowest
        if span == 0:
            # Constant column: x - min is already 0 for every present value;
            # cast to float so the dtype matches the scaled columns.
            df[col] = (df[col] - lowest).astype('float64')
        else:
            df[col] = (df[col] - lowest) / span
    return df
# Rescale every model input column of the stacked train+test frame, then
# preview the result.
features = scale(df=features, cols=in_cols)
features.head()

# **7. Encode data**


In [0]:
# Encode string data

def oneHotEncode(df, col):
    """Return a copy of ``df`` with column ``col`` replaced by indicator columns.

    One indicator column is created per category of ``col``, named
    ``"<col>_<value>"``; the indicator columns are appended after the
    remaining columns and ``col`` itself is removed.

    The input frame is left untouched: the categorical conversion is done on
    a separate Series instead of being written back into ``df``.

    Args:
        df: Source DataFrame.
        col: Name of the column to encode.

    Returns:
        A new DataFrame without ``col`` and with the indicator columns added.
    """
    as_category = df[col].astype('category')
    dummies = pd.get_dummies(as_category, prefix=col)
    return pd.concat([df.drop(columns=[col]), dummies], axis=1)
# Replace 'month' by one indicator column per distinct value, computed over
# the stacked train+test frame so both parts end up with the same columns.
features = oneHotEncode(features, "month")

# Undo the stacking by position: the first start_sub rows are the training
# inputs, everything after them are the submission inputs. Note that `test`
# now refers to the processed feature frame, not the raw test file.
test, features = features.iloc[start_sub:], features.iloc[:start_sub]

In [0]:
# Preview the first rows of the processed training features.
features.head()

# **8. Creating and Training the models**



In [0]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVR

# Train one independent RandomForestRegressor per area.
# NOTE(review): rows are assigned to areas purely by position (row r belongs
# to area r % total_areas); this presumably relies on the training file being
# ordered consistently per date -- confirm against the data.
areas = [[] for _ in range(total_areas)]    # areas[a]: training row labels of area a
models = [None] * total_areas               # models[a]: model fitted for area a
print(total_areas)

for area in range(total_areas):
    # progress, as a percentage of areas handled so far
    progress = (area + 1) / total_areas * 100
    print(progress, "%")

    # every total_areas-th row, starting at this area's own offset
    areas[area].extend(range(area, len(train), total_areas))
    area_rows = areas[area]

    area_features = features.loc[area_rows]
    area_labels = labels[area_rows]
    models[area] = RandomForestRegressor()
    models[area].fit(area_features, area_labels)


print(areas[0])

# **9. Make the predictions**


In [0]:
# Make the predictions for the test rows and collect them in file order.
# Test rows are grouped per area with the same positional rule used for
# training (row r -> area r % total_areas), and each group is scored by the
# model trained for that area.
areas_sub = [[] for _ in range(total_areas)]   # areas_sub[a]: test row labels of area a
preds = [0] * len(test)                        # preds[r]: prediction for test row r

for area in range(total_areas):
    # progress, as a percentage of areas handled so far
    progress = (area + 1) / total_areas * 100
    print(progress, "%")

    areas_sub[area].extend(range(area, len(test), total_areas))
    area_rows = areas_sub[area]

    area_pred = models[area].predict(test.loc[area_rows])

    # scatter this area's predictions back to their original row positions
    for offset, row in enumerate(area_rows):
        preds[row] = area_pred[offset]


print(areas[0])

# **10. Make the output file**


In [0]:
# Create the submission frame: start from the sample submission file and
# fill its 'Prediction' column with the per-row predictions, then preview it.
sample_path = 'UmojaHack#3:Hotspots/SampleSubmission.csv'
ss = pd.read_csv(sample_path).assign(Prediction=preds)
ss.head()

In [0]:

# Write the submission to disk; index=False keeps the row index out of the CSV.
ss.to_csv('starter_submission_16.csv', index=False)