# Part 1: New York City Taxi Fare Prediction

# 1.Data Cleaning and Visualization

In [4]:
# I am importing inbuilt libraries which are necessary for Data Cleaning and Visualization 

import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import pycountry
from statistics import mean, stdev
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn import datasets, linear_model, metrics
from scipy.stats import kstest
from scipy import stats
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
import statsmodels.api as sm
import plotly.express as px
from scipy.stats import kstest
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
pd.options.mode.chained_assignment = None
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [1]:
# Loading data set in train dataset which have 1 lakh entries
train = pd.read_csv("Desktop/DATASET/train.csv")
train.head()

In [2]:
# Check no of rows & columns in the dataset
print("Shape of the Training data :", train.shape)

# See all the columns in the dataset
print("All columns are :", list(train.columns))

In [3]:
# Check datatypes & count of not-null values in each field
train.info()

In [None]:
#check for missing values in train data
train.isna().sum()

In [None]:
#drop the missing values
train = train.dropna()

In [None]:
train.isna().sum()

In [None]:
train['key'] = pd.to_datetime(train['key'])
train['pickup_datetime']  = pd.to_datetime(train['pickup_datetime'])

In [None]:
# Derive calendar features from the parsed pickup timestamp.
# (The previous single-element `for i in [train]` loop added nothing.)
pickup_parts = train['pickup_datetime'].dt
train['Year'] = pickup_parts.year
train['Month'] = pickup_parts.month
train['Date'] = pickup_parts.day              # day of the month
train['Day of Week'] = pickup_parts.dayofweek # Monday=0 ... Sunday=6
train['Hour'] = pickup_parts.hour

In [None]:
# describe() reports the row count, mean, standard deviation, min, max and quartiles
# of every numeric column
train.describe()

In [None]:
# Keep only the columns used later, in a fixed order; this drops the 'key' column.
model_columns = [
    'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'fare_amount',
    'Year', 'Month', 'Date', 'Day of Week', 'Hour',
]
df = train.reindex(columns=model_columns)
df

In [None]:
# Valid latitudes lie in [-90, 90] and valid longitudes in [-180, 180].
# Rows with any pickup/drop-off coordinate outside these ranges are discarded.
latitude_ok = df['pickup_latitude'].between(-90, 90) & df['dropoff_latitude'].between(-90, 90)
longitude_ok = df['pickup_longitude'].between(-180, 180) & df['dropoff_longitude'].between(-180, 180)
df = df[latitude_ok & longitude_ok]

In [None]:
# we are using haversine_distance function to calculate distance between pickup point to droping point
# from given pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude

def haversine_distance(lat1, long1, lat2, long2, frame=None):
    """Add the great-circle distance between two coordinate pairs as 'H_Distance'.

    Parameters
    ----------
    lat1, long1 : str
        Column names holding the start latitude / longitude in degrees.
    lat2, long2 : str
        Column names holding the end latitude / longitude in degrees.
    frame : DataFrame, optional
        Frame to read the columns from and to write 'H_Distance' into.
        Defaults to the notebook-level ``df`` (the original behaviour).

    Returns
    -------
    The computed distances in kilometres (also stored in ``frame['H_Distance']``).
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame

    R = 6371  #radius of earth in kilometers
    phi1 = np.radians(frame[lat1])
    phi2 = np.radians(frame[lat2])

    delta_phi = np.radians(frame[lat2]-frame[lat1])
    delta_lambda = np.radians(frame[long2]-frame[long1])

    #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
    a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (near-antipodal
    # points), which would make sqrt(1 - a) NaN; clamp it back into range.
    a = np.clip(a, 0.0, 1.0)

    #c = 2 * atan2( √a, √(1−a) )
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

    #d = R*c
    d = (R * c) #in kilometers
    frame['H_Distance'] = d
    return d

In [None]:
# Calling haversine_distance function which is declare above
distance = haversine_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

In [None]:
# here is output after adding H_Distance
df

In [None]:
# dropping the 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude' and 'pickup_datetime' columns from the df dataframe
# because they are no longer needed once the date parts and H_Distance have been derived

del df['pickup_datetime']
del df['pickup_longitude']
del df['pickup_latitude']
del df['dropoff_longitude']
del df['dropoff_latitude']


In [None]:
#Look for no. of non-positive fare_amount values.
df[df['fare_amount']<=0]

In [None]:
# Keep only rows whose H_Distance, fare_amount and passenger_count are all strictly positive
# (zero or negative values are meaningless for a taxi trip).
all_positive = (df['H_Distance']>0) & (df['fare_amount']>0) & (df['passenger_count']>0)
df = df[all_positive]

In [None]:
df

In [None]:
# Here i am adding new column fare_amount_per_KM
# and it is function to calculate fare_amount_per_KM for each entry 
def fare_amount_per_KM(col1, col2, frame=None):
    """Add the column 'Fare_Amount_Per_KM' = frame[col1] / frame[col2].

    Parameters
    ----------
    col1 : str
        Name of the numerator column (the fare).
    col2 : str
        Name of the denominator column (the distance).
    frame : DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df``
        (the original behaviour).

    Returns nothing; the result is written into ``frame``.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame
    frame['Fare_Amount_Per_KM'] = frame[col1]/frame[col2]

In [None]:
# calling fare_amount_per_KM function
fare_amount_per_KM('fare_amount','H_Distance')

In [None]:
# df dataframe after adding fare_amount_per_KM column
df

In [None]:
# correlation between columns 
df.corr()

In [None]:
df.describe()

In [None]:
#Removing outliers with the help of plots

# (1) removing outliers from fare_amount column with the help of violinplot
plt.figure(figsize=(25,10))
sns.violinplot(df["fare_amount"], color='#008000')

In [None]:
df = df[df['fare_amount']<=80]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["fare_amount"], color='#008000')

In [None]:
df = df[df['fare_amount']<=59]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["fare_amount"], color='#008000')

In [None]:
# (2) removing outliers from H_Distance column with the help of violinplot
plt.figure(figsize=(25,10))
sns.violinplot(df["H_Distance"], color='#008000')

In [None]:
df = df[df['H_Distance']<1000]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["H_Distance"], color='#008000')

In [None]:
df = df[df['H_Distance']<100]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["H_Distance"], color='#008000')

In [None]:
df = df[df['H_Distance']<23]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["H_Distance"], color='#008000')

In [None]:
# (3) removing outliers from Fare_Amount_Per_KM column with the help of violinplot
plt.figure(figsize=(25,10))
sns.violinplot(df["Fare_Amount_Per_KM"], color='#008000')

In [None]:
df = df[df['Fare_Amount_Per_KM']<1000]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["Fare_Amount_Per_KM"], color='#008000')

In [None]:
df = df[df['Fare_Amount_Per_KM']<50]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["Fare_Amount_Per_KM"], color='#008000')

In [None]:
df = df[df['Fare_Amount_Per_KM']<15]

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(df["Fare_Amount_Per_KM"], color='#008000')

In [None]:
# ploting graph between 'fare_amount' and 'H_Distance'
temp_df = df[['fare_amount','H_Distance']].sort_values('H_Distance')
plt.plot(temp_df['H_Distance'],temp_df['fare_amount'])

In [None]:
# ploting graph between 'fare_amount' and 'Year'
# It shows how fare_amount increasing as year incresing
temp_df = df[['fare_amount','Year']].sort_values('Year')
plt.plot(temp_df['Year'],temp_df['fare_amount'])

In [None]:
plt.plot(df.index,df.fare_amount)
plt.show()

In [None]:
plt.plot(df.H_Distance,df.fare_amount)
plt.show()

In [None]:
plt.plot(df.fare_amount,df.passenger_count)
plt.show()

In [None]:
plt.plot(df.Fare_Amount_Per_KM,df.fare_amount)
plt.show()

In [None]:
plt.plot(df.index,df.fare_amount)
plt.show()


In [None]:
# Calculation of the coefficient of determination of the prediction.

X = df[['H_Distance','Year','Month',"Date",'Day of Week','Hour']]
y = df['fare_amount']

In [None]:
# printing the length of X and y
print(len(X),len(y))
# Pair every feature row with its target value. Iterating a DataFrame directly
# (as zip(X, y) did) yields its column names, so rows are taken from to_numpy().
arr = [[features, target] for features, target in zip(X.to_numpy(), y)]

In [None]:
Linear = LinearRegression()

In [None]:
ANS = Linear.fit(np.array(X), y)

In [None]:
ANS.score(np.array(X), y)

# 2.DATA SCALING

In [None]:
#creating another copy df dataset
df_1 = pd.DataFrame(df.copy())

# Before scaling
df_1

In [None]:
# Here we scale the data because scaled features make the model easier to train
scaler = StandardScaler()

In [None]:
# Here i am doing scaling of df_1 dataframe
for col in ['passenger_count','fare_amount','H_Distance','Fare_Amount_Per_KM','Year','Month','Date','Day of Week','Hour']:
    df_1[col] = scaler.fit_transform(df_1[[col]])

In [None]:
# After scaling
df_1.head()

In [None]:
# sorting df_1 dataframe according H_Distance
df_1 = df_1.sort_values('H_Distance')

# 3.Building a Pipeline

In [None]:
# Data_processing_Visualization class for adding H_Distance,splitting pickup_datetime into Year,day.. and also for removing outliers  

class Data_processing_Visualization(BaseEstimator,TransformerMixin):
    """Clean the raw taxi-fare frame and derive the modelling features.

    ``transform`` expects the columns 'key', 'fare_amount', 'pickup_datetime',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
    'dropoff_latitude' and 'passenger_count', and returns a new frame with
    'passenger_count', 'fare_amount', 'Year', 'Month', 'Date', 'Day of Week',
    'Hour', 'H_Distance' and 'Fare_Amount_Per_KM'. The input frame is not modified.

    Parameters
    ----------
    max_distance_km : float, default 23
        Rows with H_Distance >= this value are dropped as outliers.
    max_fare_per_km : float, default 15
        Rows with Fare_Amount_Per_KM >= this value are dropped as outliers.
    max_fare : float, default 59
        Rows with fare_amount > this value are dropped as outliers.
    """

    def __init__(self, max_distance_km=23, max_fare_per_km=15, max_fare=59):
        self.max_distance_km = max_distance_km
        self.max_fare_per_km = max_fare_per_km
        self.max_fare = max_fare

    def fit(self,train, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    @staticmethod
    def _haversine_km(lat1, long1, lat2, long2):
        """Great-circle distance in kilometres between coordinates given in degrees."""
        R = 6371  #radius of earth in kilometers
        phi1 = np.radians(lat1)
        phi2 = np.radians(lat2)
        delta_phi = np.radians(lat2-lat1)
        delta_lambda = np.radians(long2-long1)

        #a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2

        #c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))

        #d = R*c
        return R * c

    def transform(self,train):
        """Return the cleaned, feature-enriched copy of ``train``."""
        # Work on an explicit copy of the complete rows so the column assignments
        # below never write into a slice of the caller's frame.
        # 'key' takes part in the missing-value check only; it is not parsed
        # because it is discarded before anything reads it.
        train = train[['key', 'fare_amount','pickup_datetime', 'pickup_longitude', 'pickup_latitude',
                       'dropoff_longitude', 'dropoff_latitude', 'passenger_count']].dropna().copy()

        # Calendar features from the pickup timestamp.
        train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
        pickup_parts = train['pickup_datetime'].dt
        train['Year'] = pickup_parts.year
        train['Month'] = pickup_parts.month
        train['Date'] = pickup_parts.day
        train['Day of Week'] = pickup_parts.dayofweek
        train['Hour'] = pickup_parts.hour

        # Discard rows whose coordinates are outside the valid latitude/longitude ranges.
        coordinates_ok = (
            train['pickup_latitude'].between(-90, 90)
            & train['dropoff_latitude'].between(-90, 90)
            & train['pickup_longitude'].between(-180, 180)
            & train['dropoff_longitude'].between(-180, 180)
        )
        train = train[coordinates_ok]

        # Output frame: the raw coordinates and the timestamp are replaced by H_Distance.
        df = train[['passenger_count','fare_amount','Year', 'Month', 'Date','Day of Week', 'Hour']].copy()
        df['H_Distance'] = self._haversine_km(
            train['pickup_latitude'], train['pickup_longitude'],
            train['dropoff_latitude'], train['dropoff_longitude'],
        )

        # Zero or negative values are meaningless for a trip.
        positive = (df['H_Distance']>0) & (df['fare_amount']>0) & (df['passenger_count']>0)
        df = df[positive].copy()

        df['Fare_Amount_Per_KM'] = df['fare_amount']/df['H_Distance']

        # Outlier removal using the configured thresholds.
        within_limits = (
            (df['H_Distance']<self.max_distance_km)
            & (df['Fare_Amount_Per_KM']<self.max_fare_per_km)
            & (df['fare_amount']<=self.max_fare)
        )
        return df[within_limits]
#         ========================================================
 

In [None]:
train_1 = pd.read_csv("Desktop/DATASET/train.csv")

In [None]:
#just checking Data_processing_Visualization class working properly or not 
x = Data_processing_Visualization()
x.fit_transform(train_1)

In [None]:
# Another class for scaling
class Data_Scaling(BaseEstimator,TransformerMixin):
    """Standardise the numeric taxi columns (zero mean, unit variance per column).

    The scaling statistics are computed from the frame passed to ``transform``
    itself; ``fit`` learns nothing.
    """
    def __init__(self):
        pass
    def fit(self,train, y=None):
        """Nothing is learned here; returns self."""
        return self
    def transform(self,df):
        """Return a scaled copy of ``df``; the caller's frame is left untouched."""
        scaled = df.copy()  # do not overwrite the caller's columns in place
        scaler = StandardScaler()
        for col in ['passenger_count','fare_amount','H_Distance','Fare_Amount_Per_KM','Year','Month','Date','Day of Week','Hour']:
            scaled[col] = scaler.fit_transform(scaled[[col]])
        return scaled
            

In [None]:
Data_Pipline = Pipeline([('Data_1',Data_processing_Visualization()),('Data_2',Data_Scaling())])
ans = Data_Pipline.fit_transform(train_1)

In [None]:
# train_1 dataframe After removing outliers and Scaling
ans

# 4.Use Of Validation Set and Cross Validation Approach

In [None]:
# Here we are fixing value of k for k_cross folding validation
n_splits = 10

In [None]:
Validation_set = df_1[['Year','passenger_count','fare_amount','H_Distance','Fare_Amount_Per_KM']].copy()
Validation_set

In [None]:
# shuffle the DataFrame rows (fixed seed so the folds below are the same on every run)
Validation_set = Validation_set.sample(frac = 1, random_state = 42)
Validation_set

In [None]:
# Checking shape of Validation_set
Validation_set.shape

In [None]:
# First Method  for validation 

In [None]:
#  k-fold cross validation algorithm from scratch to evaluate our model and choose hyper-parameters using libraries

# Here i am declaring some array for futher calculation to store Actual value of fare_amount,
# Predicted value of fare_amount and H_Distance


Actual_value_1 = np.array([])
Predicted_Value_1 = np.array([])
Distance_value_1 = np.array([])

# Number of rows in each test fold. Rows left over at the end of the frame
# (len % n_splits) are never tested; they only ever appear in the training part.
total_range = len(Validation_set)//n_splits

for Index in range(n_splits):
    fold_start = Index*total_range
    fold_stop = fold_start + total_range

    # Positional split: the fold is the test part, everything before and after
    # it is the training part.
    test_fold = Validation_set.iloc[fold_start:fold_stop]
    train_fold = pd.concat([Validation_set.iloc[:fold_start], Validation_set.iloc[fold_stop:]])

    Fare_amount_train = train_fold['fare_amount']
    Fare_amount_test = test_fold['fare_amount']

    # Feature matrices: every column except the target.
    train_data = train_fold.drop(columns=['fare_amount'])
    test_data = test_fold.drop(columns=['fare_amount'])

    data_fit = LinearRegression()
    data_fit.fit(train_data, Fare_amount_train)
    test_prediction = data_fit.predict(test_data)

    df_actual_pred = pd.DataFrame({'Actual': Fare_amount_test.squeeze(), 'Predicted': test_prediction.squeeze()})
    Actual= df_actual_pred['Actual']
    predict = df_actual_pred['Predicted']
    distance = test_data['H_Distance']

    # Accumulate the out-of-fold results.
    Actual_value_1 = np.append (Actual_value_1, Actual)
    Predicted_Value_1 = np.append (Predicted_Value_1, predict)
    Distance_value_1 = np.append (Distance_value_1, distance)
#     print(Distance_value_1)
#     print(Actual_value_1)
#     print(Predicted_Value_1)
    
      
   

In [None]:
Final_df_1 =  pd.DataFrame({
    'Distance':Distance_value_1,
    'Actual_Value': Actual_value_1,
    'Predicted_Value':Predicted_Value_1
})
Final_df_1

In [None]:
# Calculation of mean_squared_error 
MSE=np.array((Final_df_1['Actual_Value']-Final_df_1['Predicted_Value'])**2)
print('mean_squared_error is',np.mean(MSE))

In [None]:
plt.rcParams['figure.figsize']=[12,10]
plt.scatter(Final_df_1['Distance'],Final_df_1['Actual_Value'])
plt.plot(Final_df_1['Distance'],Final_df_1['Predicted_Value'],color='green')
plt.show()

In [None]:
# second Method for validation 

In [None]:
#  k-fold cross validation algorithm from scratch to evaluate our model and choose hyper-parameters using libraries

# Here i am declaring some array for futher calculation to store Actual value of fare_amount,
# Predicted value of fare_amount and H_Distance


Actual_value_2 = np.array([])
Predicted_Value_2 = np.array([])
Distance_value_2 = np.array([])

# Number of rows in each test fold. Rows left over at the end of the frame
# (len % n_splits) are never tested; they only ever appear in the training part.
total_range = len(Validation_set)//n_splits

for Index in range(n_splits):
    fold_start = Index*total_range
    fold_stop = fold_start + total_range

    # Positional split: the fold is the test part, everything else is training data.
    test_fold = Validation_set.iloc[fold_start:fold_stop]
    train_fold = pd.concat([Validation_set.iloc[:fold_start], Validation_set.iloc[fold_stop:]])

    Fare_amount_train = train_fold['fare_amount']
    Fare_amount_test = test_fold['fare_amount']

    train_data = train_fold.drop(columns=['fare_amount'])
    test_data = test_fold.drop(columns=['fare_amount'])

    # Least-squares weights via the Moore-Penrose pseudo-inverse: w = pinv(A) . y
    # (no intercept column is added to A).
    A_MAT = train_data.to_numpy()
    pseudo_inverse = np.linalg.pinv(A_MAT)
    ANS = np.dot(pseudo_inverse,Fare_amount_train)
    Final_ANS = np.dot(test_data,ANS)  # predictions for this fold

    # Store THIS fold's targets, predictions and distances. The previous code
    # appended `Actual`, `predict` and `distance`, which were leftovers from the
    # last fold of the first validation method.
    Actual_value_2 = np.append (Actual_value_2, Fare_amount_test)
    Predicted_Value_2 = np.append (Predicted_Value_2, Final_ANS)
    Distance_value_2 = np.append (Distance_value_2, test_data['H_Distance'])
    
#     print(Distance_value_2)
#     print(Actual_value_2)
#     print(Predicted_Value_2)    
    
# ============================================    
    
#     A_MAT = train_data.to_numpy()
#     A_MAT_Trans = np.transpose(A_MAT)
#     Final_ANS = np.dot(A_MAT_Trans,A_MAT)
#     INV = np.linalg.inv(Final_ANS)
#     ans  = np.dot(INV,A_MAT_Trans)
#     ans_1 = np.dot(ans,Fare_amount_train)
#     Final_ANS = np.dot(test_data,np.dot(np.dot(np.linalg.inv(Final_ANS),A_MAT_Trans),Fare_amount_train))



# ===================================================



In [None]:
Final_df_2 =  pd.DataFrame({
    'Distance':Distance_value_2,
    'Actual_Value': Actual_value_2,
    'Predicted_Value':Predicted_Value_2
})
Final_df_2

In [None]:
# Calculation of mean_squared_error 
MSE=np.array((Final_df_2['Actual_Value']-Final_df_2['Predicted_Value'])**2)
print('mean_squared_error is',np.mean(MSE))

In [None]:
plt.rcParams['figure.figsize']=[12,10]
plt.scatter(Final_df_2['Distance'],Final_df_2['Actual_Value'])
plt.plot(Final_df_2['Distance'],Final_df_2['Predicted_Value'],color='green')
plt.show()

# 5.Linear Regression

In [None]:
# Splitting data into X_train, X_test, y_train, y_test with the help of train_test_split libraries
train_data_5 = Validation_set.copy()
y = train_data_5['fare_amount']
X = train_data_5.drop(['fare_amount'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=42)

**Matrix Based:**

In [None]:
# calculating mean_squared_error of a model using Matrix Based regressors

linear = linear_model.LinearRegression()
linear.fit(X_train, y_train)
ANS_Linear = linear.predict(X_test)
# mean_squared_error expects (y_true, y_pred); the value is the same either way,
# but the documented order keeps the call correct if the metric is ever swapped.
print('mean_squared_error of a model using Matrix Based regressors',mean_squared_error(y_test, ANS_Linear))

**Optimization Based**

In [None]:
# calculating mean_squared_error of a model using Optimization Based regressors

# random_state fixes the shuffling done by SGD so the reported error is reproducible.
sgd = SGDRegressor(max_iter=800000,eta0=0.000001, random_state=42)
sgd.fit(X_train,y_train)
ANS_sgd = sgd.predict(X_test)
# Label corrected: this is the optimization-based (SGD) model, not the matrix-based one.
print('mean_squared_error of a model using Optimization Based regressors',mean_squared_error(y_test, ANS_sgd))

**Non-parametric Based**

In [None]:
# calculating mean_squared_error of a model using Non-parametric based regressors

neighbour = KNeighborsRegressor(n_neighbors=2)
neighbour.fit(X_train,y_train)
ANS_neighbour = neighbour.predict(X_test)
# Label corrected: this is the non-parametric (k-nearest-neighbours) model.
print('mean_squared_error of a model using Non-parametric Based regressors',mean_squared_error(y_test, ANS_neighbour))

# Part 2: Life Expectancy (WHO)

# 1.Data Cleaning and Visualization

In [None]:
# Loading data set in train dataset
train = pd.read_csv("Desktop/DATASET/Life_Expectancy_Data.csv")
train.head()

In [None]:
train.columns

In [None]:
# creating list_2 to store names of columns of train dataset
list_2 = ['Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling']

In [None]:
# Here we are ploting multiple histogram for better understanding

# One subplot per column in list_2; the row count follows the list instead of a
# hard-coded 19. squeeze=False keeps `axis` two-dimensional even for one column.
figure, axis = plt.subplots(len(list_2), 1, figsize=(20,80), squeeze=False)
for ax, column in zip(axis[:, 0], list_2):
    # Missing values have not been removed from `train` yet, so drop them per column.
    ax.hist(train[column].dropna(), bins=100)
    ax.set_title(column)

In [None]:
train.head()

In [None]:
# checking whether train dataset have NAN value or not
train.isna().sum()

In [None]:
# dropping rows which have NaN values
# (passing both how= and thresh= to dropna raises TypeError on pandas >= 2.0;
#  the defaults already mean "drop a row if any value is missing")
train = train.dropna()
train.isna().sum()

In [None]:
train.shape

In [None]:
# Here we are dropping the Country column from the train dataset
train = train.drop(['Country'],axis=1)

In [None]:
# Removing outliers from train dataset
train = train[train[' BMI ']<=60]
train = train[train[' BMI ']>0]
train

In [None]:
train.describe()

In [None]:
# Reload the data with complete rows only. (Passing both how= and thresh= to
# dropna raises TypeError on pandas >= 2.0; the defaults do the same thing.)
train_1 = pd.read_csv("Desktop/DATASET/Life_Expectancy_Data.csv")
train_1 = train_1.dropna()

countries_names = train_1['Country']

# Map each country name known to pycountry to its ISO alpha-3 code, which is
# what plotly uses to place a country on the map.
nation = {country.name: country.alpha_3 for country in pycountry.countries}

# Names that pycountry does not know get the placeholder 'Unknown code'.
train_1['3 Code'] = [nation.get(country, 'Unknown code') for country in countries_names]

# World maps for the year 2007, one per indicator.
df = train_1.query("Year == 2007")
for indicator in ["Life expectancy ", " BMI ", "Alcohol"]:
    fig = px.choropleth(df, locations="3 Code",
                        color=indicator,  # column that drives the colour scale
                        hover_name="Country",  # column to add to hover information
                        color_continuous_scale=px.colors.sequential.Plasma
                        )
    fig.show()




In [None]:
# We can see that life expectancy is nearly the same across South America, and the same goes for middle Asia,
# and also that alcohol intake in Russia is higher than elsewhere in Asia

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(train['Life expectancy '], color='#008000')

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(train[' BMI '], color='#008000')

In [None]:
plt.figure(figsize=(25,10))
sns.violinplot(train['Adult Mortality'], color='#008000')

In [None]:
plt.scatter(train['Alcohol'], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train['percentage expenditure'], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train['Hepatitis B'], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train[' BMI '], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train['GDP'], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train['Income composition of resources'], train['Life expectancy '])
plt.show()

In [None]:
plt.scatter(train['Schooling'],train['under-five deaths '])
plt.show()

In [None]:
plt.scatter(train[' HIV/AIDS'],train['Life expectancy '])
plt.show()

# 2. Distribution analysis:

In [None]:
# Method 1

In [None]:
# Complete rows only, numeric columns only. (Passing both how= and thresh= to
# dropna raises TypeError on pandas >= 2.0; the defaults do the same thing.)
train_2 = pd.read_csv("Desktop/DATASET/Life_Expectancy_Data.csv")
train_2 = train_2.dropna()
train_2 = train_2.drop(['Country', 'Status'],axis=1)

for i in train_2.columns:

    # kstest(..., 'norm') compares against the STANDARD normal N(0, 1), so each
    # column is standardised first; otherwise the test only reports that the
    # column's mean/scale differ from 0/1, not that its shape is non-normal.
    # NOTE(review): estimating mean and std from the same sample makes the
    # p-value optimistic (the Lilliefors correction addresses this) — confirm
    # whether that matters for the conclusion drawn below.
    x , y = kstest(stats.zscore(train_2[i]), 'norm')

#     print(x)
    print('P-Value of',i,'is',y)

# below answer is showing that distribution of all the columns are different from normal distribution   

In [None]:
# method 2
# Source
# https://towardsdatascience.com/comparing-sample-distributions-with-the-kolmogorov-smirnov-ks-test-a2292ad6fee5

In [None]:
def ks_norm(sample):
    """One-sample Kolmogorov-Smirnov comparison against the standard normal N(0, 1).

    Parameters
    ----------
    sample : array-like of numbers
        Observations to test (NumPy array, list or pandas Series). The input is
        not modified; a sorted copy is used internally.

    Returns
    -------
    dict with keys "ks_stat" (largest absolute difference between the normal CDF
    and the empirical CDF, evaluated at the observations) and "p_value"
    (two-sided, from the KS distribution survival function).

    Raises
    ------
    ValueError
        If ``sample`` is empty.
    """
    # Sorted copy: the caller's data keeps its order.
    values = np.sort(np.asarray(sample, dtype=float))
    n = values.size
    if n == 0:
        raise ValueError("sample must contain at least one observation")

    # Evaluates the KS statistic at every observation in one vectorised pass.
    cdf_normal = stats.norm.cdf(values, loc = 0, scale = 1)
    # Empirical CDF: fraction of observations <= each value.
    cdf_sample = np.searchsorted(values, values, side='right') / n
    ks_stat = float(np.max(np.abs(cdf_normal - cdf_sample)))

    # Calculates the P-Value based on the two-sided test
    # The P-Value comes from the KS Distribution Survival Function (SF = 1-CDF)
    p_value = stats.kstwo.sf(ks_stat, n)
    return {"ks_stat": ks_stat, "p_value" : p_value}


# https://towardsdatascience.com/comparing-sample-distributions-with-the-kolmogorov-smirnov-ks-test-a2292ad6fee5
def cdf(sample, x, sort = False):
    """Empirical CDF of ``sample`` at ``x``: the fraction of observations <= x.

    Parameters
    ----------
    sample : array-like of numbers
        Observations (NumPy array, list or pandas Series).
    x : number
        Point at which the empirical CDF is evaluated.
    sort : bool, default False
        If True, ``sample.sort()`` is called first (in place). Sorting does not
        change the returned value.

    Raises ZeroDivisionError for an empty sample.
    """
    # Sorts the sample, if requested
    if sort:
        sample.sort()
    # Counts how many observations are at or below x (vectorised count)
    n_at_or_below = int(np.count_nonzero(np.asarray(sample) <= x))
    # Divides by the total number of observations
    return n_at_or_below / len(sample)

def ks_2samp(sample1, sample2):
    """Two-sample Kolmogorov-Smirnov comparison of two sets of observations.

    Parameters
    ----------
    sample1, sample2 : array-like of numbers
        The two samples (NumPy arrays, lists or pandas Series). Neither input
        is modified.

    Returns
    -------
    dict with keys "ks_stat" (largest absolute difference between the two
    empirical CDFs, evaluated at every observation of both samples) and
    "p_value" (two-sided, from the KS distribution survival function using the
    rounded effective sample size m*n/(m+n)).

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    first = np.sort(np.asarray(sample1))
    second = np.sort(np.asarray(sample2))
    if first.size == 0 or second.size == 0:
        raise ValueError("both samples must contain at least one observation")

    # Gets all observations
    observations = np.concatenate((first, second))

    # Empirical CDFs of both samples at every observation: searchsorted on the
    # sorted sample counts the values <= x without a Python-level scan per point.
    cdf_sample1 = np.searchsorted(first, observations, side='right') / first.size
    cdf_sample2 = np.searchsorted(second, observations, side='right') / second.size
    ks_stat = float(np.max(np.abs(cdf_sample1 - cdf_sample2)))

    # Calculates the P-Value based on the two-sided test
    # The P-Value comes from the KS Distribution Survival Function (SF = 1-CDF)
    m, n = float(first.size), float(second.size)
    en = m * n / (m + n)
    p_value = stats.kstwo.sf(ks_stat, np.round(en))
    return {"ks_stat": ks_stat, "p_value" : p_value}


# Evaluates all possible combinations.
# We want to know if the distributions are identical, so we cannot standardize them
# [, , , ,
#        'Income composition of resources', 'Schooling']
# Columns whose distributions are compared pairwise.
ks_columns = ['Adult Mortality', ' BMI ', ' HIV/AIDS', 'GDP']
ks_scores = {}
for position, name1 in enumerate(ks_columns):
    # Each unordered pair is evaluated once ...
    for name2 in ks_columns[position + 1:]:
        ks = ks_2samp(train_2[name1], train_2[name2])
        # ... and stored under both key orders.
        ks_scores[name1 + "_" +  name2] = ks
        ks_scores[name2 + "_" +  name1] = ks
        # Prints the result; the column names carry padding spaces, so strip them for the label.
        print(f"{name1.strip()} vs {name2.strip()}: ks = {ks['ks_stat']:.4f} (p-value = {ks['p_value']:.3e}, are equal = {ks['p_value'] > 0.05})")

# 3. Data Scaling:

In [None]:
# Here we are doing data scaling because sacling of data makes it easy to train model
scaler = StandardScaler()

In [None]:
train_1 = train_1.drop(['Country'],axis=1)
train_1 = train_1.drop(['Status'],axis=1)
train_1 = train_1.drop(['3 Code'],axis=1)

In [None]:
# Here I am scaling every column of the train_1 dataframe
for col in train_1.columns:
    train_1[col] = scaler.fit_transform(train_1[[col]])

In [None]:
# after scaling
train_1.head()

# 4. Building a Pipeline:

In [None]:
# Data_processing_Visualization class for the life-expectancy data: drops incomplete rows,
# removes BMI outliers and drops the non-numeric 'Country' and 'Status' columns
class Data_processing_Visualization(BaseEstimator,TransformerMixin):
    """Cleaning step for the WHO life-expectancy frame.

    ``transform`` returns a new frame and leaves its input untouched.
    """
    def __init__(self):
        pass
    
    def fit(self,train_3, y=None):
        """Nothing is learned from the data; returns self."""
        return self
    
    def transform(self,train_3):
        """Return the rows with no missing value and 0 < BMI <= 60, without 'Country'/'Status'."""
        # Non-inplace dropna: the caller's frame is not modified. (Passing both
        # how= and thresh= to dropna raises TypeError on pandas >= 2.0.)
        cleaned = train_3.dropna()

        # Removing BMI outliers
        cleaned = cleaned[(cleaned[' BMI ']>0) & (cleaned[' BMI ']<=60)]

        return cleaned.drop(['Country', 'Status'],axis=1)
#         ========================================================

In [None]:
# Another class for scaling
class Data_Scaling(BaseEstimator,TransformerMixin):
    """Standardise every column of the frame (zero mean, unit variance per column).

    The scaling statistics are computed from the frame passed to ``transform``
    itself; ``fit`` learns nothing.
    """
    def __init__(self):
        pass
    def fit(self,train_3, y=None):
        """Nothing is learned here; returns self."""
        return self
    def transform(self,train_3):
        """Return a scaled copy of ``train_3``; the caller's frame is left untouched."""
        scaled = train_3.copy()  # do not overwrite the caller's columns in place
        scaler = StandardScaler()
        for col in scaled.columns:
            scaled[col] = scaler.fit_transform(scaled[[col]])
        return scaled

In [None]:
train_3 = pd.read_csv("Desktop/DATASET/Life_Expectancy_Data.csv")
Data_Pipline = Pipeline([('Data_1',Data_processing_Visualization()),('Data_2',Data_Scaling())])
ans = Data_Pipline.fit_transform(train_3)
ans


# 5. Use of Validation Set and Cross Validation Approach:

In [None]:
# First Method  for validation 

In [None]:
# Here we are fixing value of k for k_cross folding validation
n_splits = 10

In [None]:
#  k-fold cross validation algorithm from scratch to evaluate our model and choose hyper-parameters using libraries

# Here i am declaring some array for futher calculation to store Actual value of Life expectancy and
# Predicted value of Life expectancy 

Actual_value_1 = np.array([])
Predicted_Value_1 = np.array([])

# Fold size must come from the frame that is actually split (train_1); it was
# previously taken from `train`, a differently filtered frame, so the folds
# did not cover train_1. Rows left over at the end (len % n_splits) are never tested.
total_range = len(train_1)//n_splits

for Index in range(n_splits):
    fold_start = Index*total_range
    fold_stop = fold_start + total_range

    # Positional split: the fold is the test part, everything else is training data.
    test_fold = train_1.iloc[fold_start:fold_stop]
    train_fold = pd.concat([train_1.iloc[:fold_start], train_1.iloc[fold_stop:]])

    Life_Expectancy_train = train_fold['Life expectancy ']
    train_data = train_fold.drop(columns=['Life expectancy '])
    data_fit = LinearRegression()
    data_fit.fit(train_data, Life_Expectancy_train)

    Life_Expectancy_test = test_fold['Life expectancy ']
    test_data = test_fold.drop(columns=['Life expectancy '])
    test_prediction = data_fit.predict(test_data)

    df_actual_pred = pd.DataFrame({'Actual': Life_Expectancy_test.squeeze(), 'Predicted': test_prediction.squeeze()})
    Actual= df_actual_pred['Actual']
    predict = df_actual_pred['Predicted']

    # Accumulate the out-of-fold results.
    Actual_value_1 = np.append (Actual_value_1, Actual)
    Predicted_Value_1 = np.append (Predicted_Value_1, predict)

In [None]:
Final_df_1 =  pd.DataFrame({
    'Actual_Value': Actual_value_1,
    'Predicted_Value':Predicted_Value_1
})
Final_df_1

In [None]:
# Calculation of mean_squared_error 
MSE=np.array((Final_df_1['Actual_Value']-Final_df_1['Predicted_Value'])**2)
print('mean_squared_error is',np.mean(MSE))

In [None]:
# Second method for validation

In [None]:
Actual_value_2 = np.array([])
Predicted_Value_2 = np.array([])

# Number of rows in each test fold; rows left over at the end of the frame
# (len % n_splits) are never tested.
total_range = len(train_1)//n_splits

for Index in range(n_splits):
    fold_start = Index*total_range
    fold_stop = fold_start + total_range

    # Positional split instead of NaN-masking the frame with isin() and dropna():
    # the fold is the test part, everything else is training data.
    test_fold = train_1.iloc[fold_start:fold_stop]
    train_fold = pd.concat([train_1.iloc[:fold_start], train_1.iloc[fold_stop:]])

    Life_expectancy_train = train_fold['Life expectancy ']
    Life_expectancy_test = test_fold['Life expectancy ']
    train_data = train_fold.drop(columns=['Life expectancy '])
    test_data = test_fold.drop(columns=['Life expectancy '])

    # Least-squares weights via the Moore-Penrose pseudo-inverse: w = pinv(A) . y
    # (no intercept column is added to A).
    A_MAT = train_data.to_numpy()
    pseudo_inverse = np.linalg.pinv(A_MAT)
    weights = np.dot(pseudo_inverse,Life_expectancy_train)
    Final_ANS = np.dot(test_data,weights)  # predictions for this fold

    Actual_value_2 = np.append (Actual_value_2, Life_expectancy_test)
    Predicted_Value_2 = np.append (Predicted_Value_2, Final_ANS)
   

In [None]:

Final_df_2 =  pd.DataFrame({
    'Actual_Value': Actual_value_2,
    'Predicted_Value':Predicted_Value_2
})
Final_df_2

In [None]:
# Calculation of mean_squared_error 
MSE=np.array((Final_df_2['Actual_Value']-Final_df_2['Predicted_Value'])**2)
print('mean_squared_error is',np.mean(MSE))

# 6. Feature Selection:

**Feature Selection using lasso method regulizer**

In [None]:
# https://www.yourdatateacher.com/2021/05/05/feature-selection-in-machine-learning-using-lasso-regression/#:~:text=How%20can%20we%20use%20it,its%20coefficient%20equal%20to%200.

In [None]:

list_1 = ['Year', 'Adult Mortality', 'infant deaths',
       'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ',
       'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ',
       ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years',
       ' thinness 5-9 years', 'Income composition of resources', 'Schooling']
X=train_1.drop(['Life expectancy '],axis=1)
y=train_1[['Life expectancy ']]
# Fixed seed: the features selected below depend on which rows land in the training part.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [None]:
pipeline = Pipeline([
                     ('scaler',StandardScaler()),
                     ('model',Lasso())
])
Val = GridSearchCV(pipeline,
                      {'model__alpha':np.arange(0.1,10,0.1)},
                      cv = 5, scoring="neg_mean_squared_error",verbose=0
                      )

In [None]:
Val

In [None]:
Val.fit(X_train,y_train)

In [None]:
Val.best_params_

In [None]:
coefficients = Val.best_estimator_.named_steps['model'].coef_


In [None]:
importance = np.abs(coefficients)

In [None]:
# train_1.columns
w = np.array(list_1)[importance > 0]
w

**Feature Selection using OLS**

In [None]:
def get_stats():
    """Fit an ordinary-least-squares model of y_train on X_train and print its summary.

    Reads the notebook-level X_train / y_train. No constant column is added to
    X_train here, so the model is fitted without an intercept term.
    """
    ols_fit = sm.OLS(y_train, X_train).fit()
    print(ols_fit.summary())
get_stats()

**Feature Selection using Scikit learn library**

In [None]:
# Use a fresh estimator instead of `Linear`, which is a leftover object that was
# created (and fitted on the taxi data) in Part 1 of the notebook.
sfs = SequentialFeatureSelector(LinearRegression())

In [None]:
sfs.fit(X_train, y_train)
X_train

In [None]:
sfs.get_support()

In [None]:
train_1 = train_1[['Adult Mortality', ' BMI ', ' HIV/AIDS', 'GDP',
       ' thinness 5-9 years', 'Income composition of resources',
       'Schooling','Life expectancy ']]

In [None]:
train_1

In [None]:
# Error calculation after feature selection (LinearRegression).
# The features selected by the Lasso regularizer are used for the error calculation
# because that method keeps the fewest features.

In [None]:
y = train_1['Life expectancy ']
X = train_1.drop(['Life expectancy '],axis=1)

In [None]:
# Fixed seed so the reported error is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)

In [None]:
Linear = LinearRegression()

In [None]:
Linear.fit(X_train,y_train)

In [None]:
ANS_Linear = Linear.predict(X_test)
# mean_squared_error expects (y_true, y_pred).
mean_squared_error(y_test, ANS_Linear)