In [None]:
# Attach Google Drive to the Colab VM so files under MyDrive become reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Print the full path of every Excel workbook found anywhere under MyDrive.
DRIVE_ROOT = "/content/drive/MyDrive"
for folder, _subfolders, filenames in os.walk(DRIVE_ROOT):
    for workbook in (name for name in filenames if name.endswith(".xlsx")):
        print(os.path.join(folder, workbook))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from math import ceil

In [None]:
#Surya
# Read the life-expectancy CSV from Drive and preview the first rows.
import pandas as pd

DATA_CSV = '/content/drive/MyDrive/ML_Project_Data/Life Expectancy Data.csv'
df = pd.read_csv(DATA_CSV)
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Capture the headers before touching them so this printout really shows the originals.
original_columns = df.columns.tolist()
print("Original columns:", original_columns)

# Normalise the headers: trim stray whitespace, then use underscores for spaces.
df.columns = df.columns.str.strip().str.replace(' ', '_')

# After normalisation a few names still contain '-', '/' or a doubled underscore;
# map them to plain identifiers (the names the recorded outputs further down show).
column_mapping = {
    'under-five_deaths': 'under_five_deaths',
    'HIV/AIDS': 'HIV_AIDS',
    'thinness__1-19_years': 'thinness_1_19_years',
    'thinness_5-9_years': 'thinness_5_9_years'
}

# rename() ignores keys that are not present, so re-running this cell is harmless.
df = df.rename(columns=column_mapping)
print("Fixed columns:", df.columns.tolist())

kartik

In [None]:
#kartik
#summary statistics
# info(): non-null count and dtype (int / float / object) of every column
df.info()
# describe(): count, mean, std, min, quartiles and max of the numeric columns
summary_stats = df.describe()
summary_stats

In [None]:
#data cleaning
##handling missing values
# Number of missing (NaN) entries in each column.
df.isnull().sum()  #checking the null values

In [None]:
#heatmap for the null values
# Each cell is coloured by whether the value is missing; the colour bar is hidden
# because the mask only has two states.
fig, ax = plt.subplots(figsize=(10, 6))
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False, cmap='viridis', ax=ax)
ax.set_title("Missing Values Heatmap")

In [None]:
#checking the skewness
#histograms (with KDE) of every numeric column, laid out on a 3-wide grid
from math import ceil

num_cols = df.select_dtypes(include='number').columns.tolist()  # names of the numeric columns

n_cols = 3                             # plots per row
n_rows = ceil(len(num_cols) / n_cols)  # enough rows to hold every column

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4 * n_rows))
axes = axes.flatten()  # 2D grid of axes -> flat sequence, one axis per plot

# Pair each numeric column with the next free axis.
for ax, col in zip(axes, num_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Histogram of {col}')

# Axes beyond the last column are empty; drop them from the figure.
for spare_ax in axes[len(num_cols):]:
    fig.delaxes(spare_ax)

plt.tight_layout()
plt.show()


In [None]:
#numeric check of the skewness of every numeric column
num_cols = df.select_dtypes(include='number').columns.tolist()
for column_name in num_cols:
    column_skew = df[column_name].skew()
    print(f"{column_name} skewness: {column_skew}")

In [None]:
#checking the highly skewed columns
# A column counts as highly skewed when its skewness lies outside [-1, 1].
highly_skewed = [col for col in num_cols if abs(df[col].skew()) > 1]

print("Highly skewed columns:", highly_skewed)


In [None]:
# handling missing values
# columns with moderate skew (-1 < skew < 1) are imputed with the mean
# highly skewed columns are imputed with the median, which is less affected by the long tail

for col in num_cols:
    if df[col].isnull().any():  # only touch columns that actually have gaps
        skewness = df[col].skew()
        fill_value = df[col].mean() if -1 < skewness < 1 else df[col].median()
        # Assign the result back instead of calling fillna(inplace=True) on df[col]:
        # the in-place form acts on a temporary selection, which pandas 2.x warns about
        # and which leaves df unchanged once Copy-on-Write is enabled.
        df[col] = df[col].fillna(fill_value)


In [None]:
# now checking the missing values
# Recount NaNs per column after the imputation step above.
df.isnull().sum()

In [None]:
#detecting outliers using boxplots
# One box per numeric column, all drawn on a single shared axis.
fig, ax = plt.subplots(figsize=(20, 6))
sns.boxplot(data=df[num_cols], ax=ax)
ax.tick_params(axis='x', labelrotation=45)  # tilt the column names so they do not overlap
ax.set_title("Boxplots of Numerical Features")
plt.show()

In [None]:
#using the IQR method to detect and handle the outliers
# NOTE(review): num_cols includes the target (Life_expectancy) and Year, so they are
# capped as well - confirm that this is intended.

outlier_counts = {}

for col in num_cols:
    Q1 = df[col].quantile(0.25)   # 25th percentile
    Q3 = df[col].quantile(0.75)   # 75th percentile
    IQR = Q3 - Q1                 # Interquartile Range

    # Everything below must run once per column, so it lives inside the loop.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # A value is an outlier when it is below the lower fence OR above the upper fence
    # (it can never be both at once, so '&' would always select nothing).
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    outlier_counts[col] = len(outliers)

    #replacing the outliers with the upper_bound and the lower_bound
    df[col] = np.where(df[col] > upper_bound, upper_bound,
                    np.where(df[col] < lower_bound, lower_bound, df[col]))

# Report a compact summary instead of printing the whole frame.
print("Outliers capped per column:", outlier_counts)
df.head()
#outliers are handled

In [None]:
# Per-column NaN count, re-checked after the outlier step.
df.isnull().sum()  #checking if there is any data missing or not

Univariate analysis for numerical and categorical values

In [None]:
#univariate analysis for the numerical columns
# DataFrame.hist creates its own figure, so the size has to be passed to it directly;
# a preceding plt.figure(figsize=...) only leaves an empty figure behind.
df[num_cols].hist(figsize=(16, 6))
plt.tight_layout()
plt.show()

In [None]:
#univariate analysis for the categorical columns
# Every object-typed column except Country gets a bar chart of its value frequencies.
cat_cols = df.select_dtypes(include='object').columns.tolist()
cat_cols.remove('Country')
for col in cat_cols:
    level_counts = df[col].value_counts()
    fig, ax = plt.subplots(figsize=(8, 4))
    level_counts.plot(kind='bar', ax=ax)
    ax.set_title(f'Value Counts of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()


Surya

Bivariate analysis

In [None]:
# bivariate analysis for the numerical values
import seaborn as sns
import matplotlib.pyplot as plt

# The per-country scatter plots below are disabled: the code is wrapped in a string
# literal, so nothing is plotted and the cell only echoes the text. It would draw one
# grid of "feature vs Life_expectancy" scatter plots for every country in df.
"""# Get a list of all the features you want to plot against Life Expectancy
features = df.columns.tolist()
# Remove the target variable, 'Year', 'Status' and 'Country' from the list of features
features.remove('Life_expectancy')
features.remove('Year')
features.remove('Status')
features.remove('Country')

# Get a list of all unique countries
countries = df['Country'].unique()

# Define the number of columns and rows for the subplots
n_cols = 4
n_rows = (len(features) // n_cols) + 1

# Loop through each country
for country in countries:
    # Filter the data to only include the current country
    country_df = df[df['Country'] == country]

    print(f"\n--- Plots for {country} ---")

    # Create a new figure and a set of subplots
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 5))
    axes = axes.flatten()

    # Loop through each feature and its corresponding axis
    for i, feature in enumerate(features):
        # Create the scatter plot on the current subplot axis
        sns.scatterplot(data=country_df, x=feature, y='Life_expectancy', ax=axes[i])

        # Add a title to the subplot
        axes[i].set_title(f'Life Expectancy vs {feature}')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('Life_expectancy')

    # Remove any empty subplots if the number of features is not a perfect multiple of n_cols
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()"""

Multivariate Analysis

In [None]:
#multivariate Analysis - correlation heatmap
# Pairwise correlation of the numeric columns, annotated to two decimals.
import seaborn as sns
import matplotlib.pyplot as plt

correlation_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='Greens', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

In [None]:
#kartik

In [None]:
#X = Features (input variables) → All columns used to predict the target.
#y = Target (output variable) → The column you're trying to predict.

In [None]:
#converting the values of the categorical Status column into binary values (0/1)
status_codes = {'Developing': 0, 'Developed': 1}

# Encode only while the column still holds labels: mapping an already-encoded column
# (e.g. when this cell is run a second time) would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['Status']):
    df['Status'] = df['Status'].map(status_codes)

# map() turns any label that is not in status_codes into NaN; fail loudly instead of
# silently feeding NaNs to the model.
assert df['Status'].notna().all(), "Status contains missing or unexpected labels"

y = df['Life_expectancy']

# Country is dropped here rather than one-hot encoded, and Year is left out as well.
X = df.drop(['Life_expectancy','Country', 'Year'], axis=1)

print(X.columns)
print('-----------------------------------------------------------------')
print(y.name)

In [None]:
#splitting the data into train and test sets of X and y
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# test_size=0.2 keeps 20% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#selecting the model
# random_state also seeds the forest's bootstrap sampling, so the metrics below are
# the same on every run instead of drifting between executions.
model = RandomForestRegressor(n_estimators=100, random_state=42)

#fitting the data to train the model
model.fit(X_train, y_train)

#predicting
y_preds = model.predict(X_test)



In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# mean_squared_error returns the MSE; the RMSE is its square root.
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
r2  = r2_score(y_test, y_preds)
print(F"Root Mean Squared Error: {rmse}")                      #typical size of the prediction error, in the units of the target
print(F"R-squared / Coefficient of Determination: {r2}"  )     #share of the target's variance that the model explains

In [None]:
# Scatter of held-out targets (x) against the model's predictions (y).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_preds, alpha=0.5)
ax.set_xlabel('Actual Life Expectancy')
ax.set_ylabel('Predicted Life Expectancy')
ax.set_title('Actual vs Predicted')

# Dashed red diagonal: where the points would fall if every prediction were exact.
target_range = [y_test.min(), y_test.max()]
ax.plot(target_range, target_range, 'r--')
plt.show()


In [None]:
import joblib

# Serialise the fitted estimator into the notebook's working directory.
LOCAL_MODEL_FILE = "life_expectancy_model.pkl"
joblib.dump(model, LOCAL_MODEL_FILE)


In [None]:
# Disabled (kept as a string literal, so it does not run): downloading the pickled
# model through the Colab files helper.
'''from google.colab import files
files.download('life_expectancy_model.pkl')'''


In [None]:
# Feature names, in the order the model was trained on.
print(X.columns.tolist())


In [None]:
import os

import joblib
from google.colab import drive

# Drive has to be mounted before anything can be written under MyDrive.
drive.mount('/content/drive')

folder_path = '/content/drive/MyDrive/models'

# exist_ok=True creates the folder when needed and is a no-op when it already exists,
# replacing the separate exists-then-create check.
os.makedirs(folder_path, exist_ok=True)

model_filename = os.path.join(folder_path, 'random_forest_model.joblib')
joblib.dump(model, model_filename)
print("Model saved to", model_filename)


In [None]:
import os
# Show the contents of the current working directory.
print(os.listdir())

In [None]:
# Disabled draft of a FastAPI service for the random-forest model (kept as a string
# literal, so none of it runs). The ngrok authtoken that used to be embedded here has
# been removed: it must be revoked in the ngrok dashboard and supplied through the
# NGROK_AUTHTOKEN environment variable instead.
'''# Install packages (run only once)
!pip install fastapi uvicorn nest-asyncio pyngrok joblib

import os
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd
import nest_asyncio
from pyngrok import ngrok
import uvicorn

class LifeExpInput(BaseModel):
    Status: int  # 0 or 1
    Adult_Mortality: float
    infant_deaths: float
    Alcohol: float
    percentage_expenditure: float
    Hepatitis_B: float
    Measles: float
    BMI: float
    under_five_deaths: float
    Polio: float
    Total_expenditure: float
    Diphtheria: float
    HIV_AIDS: float
    GDP: float
    Population: float
    thinness_1_19_years: float
    thinness_5_9_years: float
    Income_composition_of_resources: float
    Schooling: float


# Load the model from mounted Google Drive path
model = joblib.load('/content/drive/MyDrive/models/random_forest_model.joblib')

app = FastAPI()

@app.post("/predict")
def predict(payload: LifeExpInput):
    data = pd.DataFrame([payload.dict()])
    try:
        prediction = model.predict(data)[0]
    except Exception as e:
        return {"error": str(e)}
    return {"predicted_life_expectancy": float(prediction)}


# Setup and run server with ngrok tunnel
nest_asyncio.apply()

# You can comment out or skip this if it causes errors in Colab
# !fuser -k 8000/tcp

ngrok.set_auth_token(os.environ["NGROK_AUTHTOKEN"])

public_url = ngrok.connect(8000)
print("Public URL:", public_url)

uvicorn.run(app, host='0.0.0.0', port=8000)'''

In [None]:
import pandas as pd

# Expects df to be already free of missing values and outliers.
# One-hot encode the categorical columns: every Country (and Status) level becomes its
# own indicator column; drop_first=True omits the first level of each.
categorical_columns = ['Country', 'Status']
df = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

Linear Regression

In [None]:
# Column index after one-hot encoding (includes the Country_* / Status_* indicators).
print(df.columns)


Index(['Year', 'Life_expectancy', 'Adult_Mortality', 'infant_deaths',
       'Alcohol', 'percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI',
       'under_five_deaths',
       ...
       'Country_United States of America', 'Country_Uruguay',
       'Country_Uzbekistan', 'Country_Vanuatu',
       'Country_Venezuela (Bolivarian Republic of)', 'Country_Viet Nam',
       'Country_Yemen', 'Country_Zambia', 'Country_Zimbabwe', 'Status_1'],
      dtype='object', length=213)


In [None]:
from sklearn.model_selection import train_test_split

# Target column vs. everything else as features.
TARGET = 'Life_expectancy'
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training split in one step
# (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)
model

In [None]:
# Make predictions on the test set
# y_pred holds the linear-regression predictions for X_test.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Error of the linear model on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)          # back in the units of the target
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to two decimals.
for label, value in (
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R2) score", r2),
):
    print(f"{label}: {value:.2f}")

Mean Squared Error (MSE): 3.40
Root Mean Squared Error (RMSE): 1.85
R-squared (R2) score: 0.96


In [None]:
import joblib

# Save your trained model to a file
# Written to the current working directory; joblib.dump returns the list of files it wrote.
joblib.dump(model, 'linear_regression_model.pkl')

['linear_regression_model.pkl']

Create the FastAPI Application

In [None]:
from google.colab import drive
import joblib

# Drive must be mounted before the model can be written under MyDrive.
drive.mount('/content/drive')

# Persist the fitted linear model where the API module will load it from.
DRIVE_MODEL_FILE = '/content/drive/MyDrive/linear_regression_model.pkl'
joblib.dump(model, DRIVE_MODEL_FILE)

KeyboardInterrupt: 

In [None]:
!pip install fastapi uvicorn python-multipart pyngrok

In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# Never commit the authtoken to the notebook: read it from the environment, or ask
# for it interactively (getpass does not echo the input into the cell output).
# The token that was previously hardcoded here should be revoked in the ngrok dashboard.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

In [None]:
%%writefile /content/drive/MyDrive/main.py

from fastapi import FastAPI
import joblib
import pandas as pd

# Load the trained model from Google Drive
model = joblib.load('/content/drive/MyDrive/linear_regression_model.pkl')

# Initialize the FastAPI app
app = FastAPI()

# Create a route to make predictions
@app.post("/predict")
def predict_life_expectancy(data: dict):
    # Convert the input data to a Pandas DataFrame
    input_df = pd.DataFrame([data])

    # Make a prediction using the loaded model
    prediction = model.predict(input_df)

    # Return the prediction
    return {"predicted_life_expectancy": prediction.tolist()}

In [None]:
from pyngrok import ngrok
import subprocess
import time

# Folder that contains main.py (written there by the %%writefile cell).
APP_DIR = '/content/drive/MyDrive'

# Kill any existing ngrok tunnels
ngrok.kill()

# Run FastAPI with Uvicorn on a specific port.
# cwd=APP_DIR is required: uvicorn imports "main" relative to its working directory,
# and main.py is on Drive, not in the notebook's default directory.
print("Starting Uvicorn server...")
process = subprocess.Popen(
    ['uvicorn', 'main:app', '--host', '0.0.0.0', '--port', '8000'],
    cwd=APP_DIR,
)

# Wait a moment for the server to start
time.sleep(5)

# poll() returns None while the process is alive; anything else means uvicorn has
# already exited (import error, port in use, ...), so do not open a tunnel to nothing.
if process.poll() is not None:
    raise RuntimeError(f"Uvicorn exited early with code {process.returncode}")

# Connect ngrok to the Uvicorn server
public_url = ngrok.connect(8000)
print(f"Your public URL is: {public_url}")

In [None]:
import threading
import uvicorn
from pyngrok import ngrok

# Folder that contains main.py (written there by the %%writefile cell).
APP_DIR = "/content/drive/MyDrive"

# Kill any existing ngrok tunnels
ngrok.kill()

# Define a function to run the FastAPI app
def run_uvicorn():
    """Serve main:app on port 8000; blocks until the server stops."""
    # app_dir puts APP_DIR on the import path so the "main:app" import string resolves
    # even though the notebook's working directory is elsewhere.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, log_level="info", app_dir=APP_DIR)

# Run the FastAPI app in a separate thread (daemon, so it does not block kernel shutdown)
thread = threading.Thread(target=run_uvicorn, daemon=True)
thread.start()
print("FastAPI server is running...")

# Connect ngrok to the FastAPI server
public_url = ngrok.connect(8000).public_url
print(f"Your public URL is: {public_url}")

In [None]:
import os
import threading
import time

import uvicorn
from pyngrok import ngrok

# Start from a clean slate: close any ngrok tunnel left over from earlier cells.
ngrok.kill()

# main.py lives on Drive; switch there so the "main:app" import string resolves.
os.chdir("/content/drive/MyDrive")


def run_uvicorn():
    """Serve main:app on port 8000; blocks until the server stops."""
    uvicorn.run("main:app", host="0.0.0.0", port=8000, log_level="info")


# Serve from a daemon thread so the notebook cell returns immediately.
thread = threading.Thread(target=run_uvicorn, daemon=True)
thread.start()
print("FastAPI server is running...")

# Give the server a few seconds to come up before exposing it.
time.sleep(5)

# Open the public tunnel to the local port and report its address.
tunnel = ngrok.connect(8000)
public_url = tunnel.public_url
print(f"Your public URL is: {public_url}")

In [None]:
from pyngrok import ngrok
# Shut down the ngrok process and, with it, any open tunnels.
ngrok.kill()

In [None]:
# Feature names of the training split - the columns a /predict request has to match.
print(X_train.columns.tolist())