In [None]:

#Idea: Prediction of groundwater levels: I came across a research paper titled 
#"Groundwater Prediction Using Machine-Learning Tools" (https://www.mdpi.com/1999-4893/13/11/300) 
#and thought of comparing results from regression analysis, SVM and random forests to predict the 
#groundwater levels in India. I use values of precipitation (pr), altitude (alt), 
#average annual temperature (at) and distance from sea (dsea) to predict the groundwater levels.  

#Progress: Managed to run basic GLM and SVM models - I need to understand the math better to improve them.


In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

def clear_dataframe():
    """Return a brand-new, empty pandas DataFrame (no rows, no columns)."""
    empty_frame = pd.DataFrame()
    return empty_frame

In [None]:
# Convert the .dta file from the American Economic Association journal's appendix
# (paper titled "Wells, Water, and Welfare: The Impact of Access to Groundwater
# on Rural Poverty and Conflict") into a CSV file next to it.
# NOTE(review): both paths are absolute and specific to one machine.

iac = pd.read_stata(r'C:\Users\vishe\Downloads\113902-V1\Data-and-Read-Me\AEJ_IAC.dta')
iac.to_csv(path_or_buf=r'C:\Users\vishe\Downloads\113902-V1\Data-and-Read-Me\AEJ_IAC.csv')

In [None]:

def aquifer_depth_str(dmaq2, dmaq3):
    """Return the aquifer-depth label implied by the two indicator values.

    Args:
        dmaq2: indicator compared against 1 for the 'medium' label.
        dmaq3: indicator compared against 1 for the 'high' label; it is
            checked first, so it wins when both indicators equal 1.

    Returns:
        'high' if ``dmaq3 == 1``, otherwise 'medium' if ``dmaq2 == 1``,
        otherwise 'low'.
    """
    if dmaq3 == 1:
        return 'high'
    return 'medium' if dmaq2 == 1 else 'low'

# Add the text label column 'aquifer_depth_str', computed row by row from the
# 'dmaq2' and 'dmaq3' columns.
iac['aquifer_depth_str'] = iac.apply(
    lambda rec: aquifer_depth_str(dmaq2=rec['dmaq2'], dmaq3=rec['dmaq3']),
    axis=1,
)

def aquifer_depth(dmaq2, dmaq3):
    """Return the numeric aquifer-depth code implied by the two indicators.

    Args:
        dmaq2: indicator compared against 1 for code 2.
        dmaq3: indicator compared against 1 for code 3; it is checked first,
            so it wins when both indicators equal 1.

    Returns:
        3 if ``dmaq3 == 1``, otherwise 2 if ``dmaq2 == 1``, otherwise 1.
    """
    depth_code = 1
    if dmaq3 == 1:
        depth_code = 3
    elif dmaq2 == 1:
        depth_code = 2
    return depth_code

# Add the numeric code column 'aquifer_depth' (1, 2 or 3), computed row by row
# from the 'dmaq2' and 'dmaq3' columns.
iac['aquifer_depth'] = iac.apply(
    lambda rec: aquifer_depth(dmaq2=rec['dmaq2'], dmaq3=rec['dmaq3']),
    axis=1,
)

# Keep only the four predictor columns and the two derived aquifer-depth
# columns; `iac` is rebound to this trimmed copy.
columns_to_concat = ['alt', 'dsea', 'at', 'pr', 'aquifer_depth_str', 'aquifer_depth']
iac = iac[columns_to_concat].copy()


In [None]:
# Understanding the data: pairwise correlations between the numeric columns.

columns_analyse = ['alt', 'dsea', 'at', 'pr', 'aquifer_depth']
iac_analyse = iac[columns_analyse]

# Correlation matrix of the selected columns (DataFrame.corr defaults).
correlation = iac_analyse.corr()
# The column names serve as tick labels on both axes.
matrix_cols = list(correlation.columns)
# Plain array of the correlation values, used as the heatmap's z data.
corr_array = np.array(correlation)

# Heatmap trace: one cell per pair of columns, with xgap/ygap of 2 between cells.
trace = go.Heatmap(
    z=corr_array,
    x=matrix_cols,
    y=matrix_cols,
    xgap=2,
    ygap=2,
    colorscale='Rainbow',
    colorbar={},
)

# Fixed-size layout with wide left/bottom margins and small tick labels.
layout = go.Layout(
    title='Correlation Matrix',
    autosize=False,
    height=720,
    width=800,
    margin=dict(r=0, l=210, t=25, b=210),
    yaxis=dict(tickfont=dict(size=9)),
    xaxis=dict(tickfont=dict(size=9)),
)

fig = go.Figure(data=[trace], layout=layout)
fig.show()

In [None]:
# Preview the first rows of the trimmed frame. `head` is a method, so it must
# be called; the bare attribute only displays the bound-method repr.
iac.head()

In [None]:
#Method 1: SVM

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


# Predictors and target: the four numeric columns are used to classify the
# text aquifer-depth label ('low' / 'medium' / 'high').
X = iac[['alt', 'dsea', 'at', 'pr']]
y = iac['aquifer_depth_str']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=4)

# Sanity-check the sizes of the resulting splits.
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

# Linear-kernel support vector classifier with regularisation parameter C=1.
svm_classifier = SVC(kernel='linear', C=1)
svm_classifier.fit(X_train, y_train)

# Predict labels for the held-out rows and report the fraction predicted correctly.
y_pred = svm_classifier.predict(X_test)
print("accuracy score is : ", accuracy_score(y_test, y_pred))


In [None]:
#Method 2: Generalised Linear Model

# The whole cell body below is a bare triple-quoted string, so none of the GLM
# code inside it is executed; it is kept as a draft for later work.
# NOTE(review): the draft fits a Binomial family to `aquifer_depth`, which
# aquifer_depth() codes as 1, 2 or 3 -- confirm the family / response coding
# (and the missing-value check mentioned in the draft) before enabling it.
"""
Need to work on this further - check the dataset for missing values



model = smf.glm(formula = "aquifer_depth ~ alt + dsea + at + pr", 
                data = iac, 
                family = sm.families.Binomial())

# Fit the model
result = model.fit()
# Display and interpret results
print(result.summary())

"""
