# In [None]:
import collections #used to count frequency of elements in a list

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

#modeling
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

from eda_functions import *
#displays the output inline
#%matplotlib inline

# Load the raw dataset and preview its first rows.
df = pd.read_csv("Bias_correction_ucl.csv")
df.head()

# Report the frame's dimensions and the columns that contain missing values
# (helpers come from eda_functions; their exact output is not shown here).
df_dimensions(df)
cols_missing_values(df)

# Checking if there is a large percentage of missing data, first with a
# threshold of .75 and then with .5 (presumably the missing fraction per
# column -- confirm against most_missing_values in eda_functions).
for missing_threshold in (.75, .5):
    most_missing_values(df, missing_threshold)



# Percentage of missing values per column, before any cleaning.
df.isnull().mean() * 100

# Remember the row count before cleaning so the share of removed rows can be
# computed from the data instead of being copied by hand from earlier output.
rows_before_drop = df.shape[0]

# Drop every row that has at least one missing value.
df = df.dropna(axis=0)
df_dimensions(df)

# Percentage of missing values per column again, now on the cleaned frame.
df.isnull().mean() * 100

# Percentage of rows removed by dropping rows with missing data
# (previously hard-coded as (7752 - 7588) / 7752 * 100).
(rows_before_drop - df.shape[0]) / rows_before_drop * 100

categorical_cols(df)

# Relative frequency of each distinct Date value, shown as a bar chart.
date = df.Date.value_counts()
(date / df.shape[0]).plot(kind="bar");
plt.title("Date");

# Number of distinct dates in the data.
len(date)

# Checking the date format to see how we are going to separate year, month,
# and day info. Use positional access: rows were removed by dropna() earlier,
# so the label 0 is not guaranteed to exist any more and df.Date[0] could
# raise KeyError.
first_date = df.Date.iloc[0]
first_date

first_date[:4] #year

first_date[5:7] #month

first_date[8:] #day

# Slice the date strings with the vectorized .str accessor; the resulting
# year/month/day columns hold strings, exactly as the per-row lambdas did.
df["year"] = df["Date"].str[:4] #get the year from the date
df["month"] = df["Date"].str[5:7] #get the month from the date
df["day"] = df["Date"].str[8:] #get the day from the date
df[["year", "month", "day"]]

# How many records exist for each year.
df["year"].value_counts()

# Years to inspect, kept as strings because the "year" column was built by
# slicing the Date string.
years = [str(year_number) for year_number in range(2013, 2018)]
for year in years:
    check_data_month(df, year)
    print("______________ \n")

# Count records per (month, year) pair, move the month level to the columns
# and draw the counts as grouped bars.
registers_by_year_and_month = df.groupby("month")["year"].value_counts().unstack(0)
registers_by_year_and_month.plot.bar()
plt.ylabel('Total number of registers per month')
plt.title("Total number of registers");


# Run the per-day check (check_data_day from eda_functions) for each listed
# month of every year, printing a header before each year and each month.
months = ["06", "07", "08"]
for selected_year in years:
    print("Year: ", selected_year)
    print("- - - - - - -")
    for selected_month in months:
        print("Month: ", selected_month)
        check_data_day(df, selected_year, selected_month)
        print("\n")
    print("______________ \n")



numerical_cols(df)

# Summary statistics and a histogram for every numeric column.
df.describe()

df.hist(figsize=(14, 12));

# Correlation heatmap. The frame still holds string columns at this point
# (Date, year, month, day); DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so restrict the input to numeric columns explicitly. This
# matches what older pandas versions did implicitly.
fig, ax = plt.subplots(figsize=(15, 15))
numeric_corr = df.select_dtypes(include="number").corr()
# Draw on the axes created above rather than relying on the "current axes".
sns.heatmap(numeric_corr, annot=True, fmt=".2f", ax=ax)


# Target columns whose distribution is inspected per year
# (year_ditribution is the helper's actual name in eda_functions).
important_columns = ["Next_Tmax", "Next_Tmin"]
for target_col in important_columns:
    year_ditribution(df, target_col)

# Distribution of Next_Tmax by month, one call per year.
for plot_year in years:
    month_distribution(df, "Next_Tmax", plot_year)

# Scatter of longitude against latitude, colored by Next_Tmax.
df.plot.scatter(x="lon", y="lat", c="Next_Tmax", colormap="YlOrRd")
plt.show()



# Remove the columns that are not needed for modeling.
df.drop(["station", "Date"], axis="columns", inplace=True)
df.columns

# Convert the year/month/day columns from string to int with the
# string_into_int helper from eda_functions, one column at a time.
columns_date = ["year", "month", "day"]
for date_part in columns_date:
    df = string_into_int(df, date_part)

# Split data in X and y: every column except the two targets is a feature.
# (The original line had a stray duplicated assignment, `X = X = ...`.)
X = df.drop(["Next_Tmax", "Next_Tmin"], axis='columns')
y_max = df["Next_Tmax"]
y_min = df["Next_Tmin"]

print(X.shape)
print(y_max.shape)
print(y_min.shape)

# Split into train and test (70% / 30%), once per target. Both calls use the
# same X, test_size and random_state, so the two splits select the same rows.
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(X, y_max, test_size=.30, random_state=42)
X_train_min, X_test_min, y_train_min, y_test_min = train_test_split(X, y_min, test_size=.30, random_state=42)

#Instantiate model
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. Ordinary least
# squares predictions do not depend on feature scaling, so the default
# estimator is used for both targets.
lm_model_max = LinearRegression()
lm_model_min = LinearRegression()

#Fit
lm_model_max.fit(X_train_max, y_train_max)
lm_model_min.fit(X_train_min, y_train_min)

#Predict and score the model on the held-out test set
y_test_preds_max = lm_model_max.predict(X_test_max)
y_test_preds_min = lm_model_min.predict(X_test_min)

rsquared_score_max = r2_score(y_test_max, y_test_preds_max)
rsquared_score_min = r2_score(y_test_min, y_test_preds_min)


#Rsquared and number of test predictions
length_y_test_max = len(y_test_preds_max)
length_y_test_min = len(y_test_preds_min)
print("The r-squared score for your model_max was {} on {} values.".format(rsquared_score_max, length_y_test_max))
print("The r-squared score for your model_min was {} on {} values.".format(rsquared_score_min, length_y_test_min))

#To check which features matter in the model, check the weight of the coefficients
# NOTE(review): coef_ is expressed in each feature's original units, so
# magnitudes are only directly comparable between features on similar scales.
# Standardize X before fitting if a scale-free ranking is needed.
# coef_weights comes from eda_functions; presumably it builds a per-feature
# coefficient table -- confirm there.
coef_df_max = coef_weights(lm_model_max.coef_, X_train_max, lm_model_max)
coef_df_max.head(20)

coef_df_min = coef_weights(lm_model_min.coef_, X_train_min, lm_model_min)
coef_df_min.head(20)







