In [53]:
# Importing necessary Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import binom
import matplotlib.pyplot as plt
from scipy.stats import poisson
from scipy.stats import norm

# Libraries for ML
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import warnings
# Install a blanket 'ignore' filter so no warnings are shown from this point on.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries
# imported above — consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [54]:
# Load the estimated-population CSV into a DataFrame and preview its first four rows
data = pd.read_csv(filepath_or_buffer='Data/Estimated Population (Persons in April).csv')
data.head(n=4)

Unnamed: 0,STATISTIC,STATISTIC Label,TLIST(A1),Year,C02076V02508,Age Group,C02199V02655,Sex,C02196V04140,Region,UNIT,VALUE
0,PEA04,Estimated Population (Persons in April),2011,2011,205,0 - 4 years,-,Both sexes,-,State,Thousand,356.0
1,PEA04,Estimated Population (Persons in April),2011,2011,205,0 - 4 years,-,Both sexes,B1A65D7C-1984-4A87-AD58-0E846812C992,Border,Thousand,30.7
2,PEA04,Estimated Population (Persons in April),2011,2011,205,0 - 4 years,-,Both sexes,A69CA800-8D87-4920-A7C1-50426A1D39B4,West,Thousand,32.6
3,PEA04,Estimated Population (Persons in April),2011,2011,205,0 - 4 years,-,Both sexes,42C5C2A5-2D71-4BD1-BDB5-BD7D3198CD78,Mid-West,Thousand,35.0


In [55]:
# Remove the code/label columns that carry no information for the analysis.
# The frame is rebound instead of mutated in place, and columns that are already
# absent are skipped, so re-running this cell does not raise a KeyError.
columns = ['STATISTIC', 'STATISTIC Label', 'TLIST(A1)', 'C02076V02508', 'C02199V02655', 'C02196V04140', "UNIT"]
data = data.drop(columns=columns, errors="ignore")
data.head(5)

Unnamed: 0,Year,Age Group,Sex,Region,VALUE
0,2011,0 - 4 years,Both sexes,State,356.0
1,2011,0 - 4 years,Both sexes,Border,30.7
2,2011,0 - 4 years,Both sexes,West,32.6
3,2011,0 - 4 years,Both sexes,Mid-West,35.0
4,2011,0 - 4 years,Both sexes,South-East,32.0


# 1- Statistics

## a- Descriptive Statistics

In [56]:
# Excluding the aggregated rows so only the most granular records remain:
# 1- 'All ages' from the 'Age Group' column
# 2- 'State' from the 'Region' column
# 3- 'Both sexes' from the 'Sex' column
is_detail_row = (
    data['Age Group'].ne("All ages")
    & data['Region'].ne('State')
    & data['Sex'].ne("Both sexes")
)
data = data.loc[is_detail_row]

In [57]:
# Function to perform all EDA
def perform_eda(df, name=""):
    """Print a quick exploratory summary of a DataFrame.

    The report contains, in order: basic dimensions, the first five rows,
    the numerical and categorical column names, ``df.info()``, per-column
    null counts, and ``df.describe(include="all")``.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to summarise. It is only read, never modified.
    name : str, optional
        Label used in the report's first line.

    Returns
    -------
    None
        Everything is written to stdout or rendered through ``display``.

    Notes
    -----
    ``display`` is not imported here; it is expected to be available as it
    is inside an IPython/Jupyter session.
    """
    separator = "=" * 50

    # Basic details: label, total cell count, number of columns and rows
    print(f"EDA of {name!s} Data....")
    print(f"Size {df.size}")
    print(f"Columns {df.shape[1]}")
    print(f"Records {df.shape[0]}")
    print(separator)

    # Top 5 records of the data
    print("First Look of Data....")
    display(df.head())
    print(separator)

    # Split the columns by dtype: object columns vs. numeric columns
    cat_cols = df.select_dtypes(object).columns
    num_cols = df.select_dtypes(np.number).columns

    # Numbered list of the numerical columns
    print("Dataset has following Numerical columns...")
    for position, column in enumerate(num_cols, start=1):
        print(f" {position}) {column}")

    # Numbered list of the categorical columns
    print("\n\nDataset has following Categorical columns...")
    for position, column in enumerate(cat_cols, start=1):
        print(f" {position}) {column}")

    # Dtypes and non-null counts. DataFrame.info() writes its report to stdout
    # itself and returns None, so it must not be wrapped in print() (that
    # would add a stray "None" line to the report).
    print(separator)
    print("Information of Data....")
    df.info()
    print(separator)

    # Number of missing values per column
    print(separator)
    print("Check for Null Values....")
    print(df.isnull().sum())
    print(separator)

    # Statistical properties of every column (count, mean, quartiles, top, freq, ...)
    print("Statistical Properties of Data....")
    display(df.describe(include="all"))
    print(separator)

In [58]:
# Run the EDA report on the filtered population data
perform_eda(df=data, name="Estimated Population")

EDA of Estimated Population Data....
Size 18720
Columns 5
Records 3744
First Look of Data....


Unnamed: 0,Year,Age Group,Sex,Region,VALUE
10,2011,0 - 4 years,Male,Border,15.8
11,2011,0 - 4 years,Male,West,16.6
12,2011,0 - 4 years,Male,Mid-West,18.0
13,2011,0 - 4 years,Male,South-East,16.4
14,2011,0 - 4 years,Male,South-West,25.5


Dataset has following Numerical columns...
 1) Year
 2) VALUE


Dataset has following Categorical columns...
 1) Age Group
 2) Sex
 3) Region
Information of Data....
<class 'pandas.core.frame.DataFrame'>
Index: 3744 entries, 10 to 6641
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Year       3744 non-null   int64  
 1   Age Group  3744 non-null   object 
 2   Sex        3744 non-null   object 
 3   Region     3744 non-null   object 
 4   VALUE      3744 non-null   float64
dtypes: float64(1), int64(1), object(3)
memory usage: 175.5+ KB
None
Check for Null Values....
Year         0
Age Group    0
Sex          0
Region       0
VALUE        0
dtype: int64
Statistical Properties of Data....


Unnamed: 0,Year,Age Group,Sex,Region,VALUE
count,3744.0,3744,3744,3744,3744.0
unique,,18,2,8,
top,,0 - 4 years,Male,Border,
freq,,208,1872,468,
mean,2017.0,,,,16.848691
std,3.742157,,,,11.43146
min,2011.0,,,,1.2
25%,2014.0,,,,10.2
50%,2017.0,,,,14.3
75%,2020.0,,,,20.0




## b- Visualizations