<a href="https://colab.research.google.com/github/tugcecalisir/IBB_Data_Analysis/blob/main/Istanbul's_Population_Analysis_By_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Population Prediction Of Istanbul By Year**
**Table of Contents**

>[Preparation](#scrollTo=NW1pWc3HSy8R)

>>[Import Libraries](#scrollTo=FzauI15kTqTM)

>>[Data Loading](#scrollTo=fexK2EGZTOEO)

>[Feature Engineering](#scrollTo=X8JuSF1KT-2D)

>>[Visualizations](#scrollTo=XS0sQ8qGUgO2)

>[Model Implementation](#scrollTo=E6ztYrq0zaCV)

>>[Linear Regression](#scrollTo=vzD-m6Yozgl8)

>>>[Data Split](#scrollTo=8Rc1u35jVzez)

>>>[Model Training](#scrollTo=hnO-Q7bPcPFW)

>>>[Model Prediction](#scrollTo=fM7eUfxXnz4t)

>>>[Model Visualizations](#scrollTo=Ou-sj3t4npcV)

>>>[Comparison Linear Regression with R2 Score](#scrollTo=Fmi63Ktlyo8F)

>>[Polynomial Linear Regression](#scrollTo=njdIReRxb7Bf)

>>>[Data Split](#scrollTo=oMkVh95vpJgs)

>>>[Model Training](#scrollTo=RKJaweuNpea9)

>>>[Model Visualizations](#scrollTo=zPCG3fgbpRn0)

>>>[Accuracy Of Polynomial Regression](#scrollTo=fgfvmHXn1xGl)



# Preparation

## Import Libraries

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib as mpl
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as po
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import plotly.figure_factory as ff 
import statsmodels.api as sm
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report

## Data Loading

In [78]:
# Raw CSV hosted in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/tugcecalisir/IBB_Data_Analysis/main/Datasets/nufus2008_2021.csv'
# A comma is read_csv's default separator, so it does not need to be passed explicitly.
df_nufus = pd.read_csv(url)
# Preview the first rows of the freshly loaded table.
df_nufus.head()

Unnamed: 0,ilce,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Adalar,14072,14341,14221,13883,14552,16166,16052,15623,14478,14907,16119,15238,16033,16372
1,Arnavutköy,163510,175871,188011,198230,206299,215531,225670,236222,247507,261655,270549,282488,296709,312023
2,Ataşehir,351046,361615,375208,387502,395758,405974,408986,419368,422513,423372,416318,425094,422594,427217
3,Avcılar,333944,348635,364682,383736,395274,407240,417852,425228,430770,435682,435625,448882,436897,457981
4,Bağcılar,720819,724268,738809,746650,749024,752250,754623,757162,751510,748483,734369,745125,737206,744351


# Feature Engineering

In [79]:
# Reshape from one row per district ('ilce') with one column per year
# to one row per year with one column per district.
#
# The district names are moved into the index *before* transposing: transposing
# while the text column 'ilce' is still part of the data upcasts every population
# value to `object`, which then leaks into the plots and the regression inputs.
#
# NOTE: this cell overwrites `df_nufus`; re-run the data-loading cell before
# executing it a second time.
df_nufus = (
    df_nufus
    .set_index('ilce')
    .T
    .reset_index()
    .rename(columns={'index': 'year'})
)
# The year labels come from the CSV header and are therefore strings;
# store them as integers so they behave as a numeric axis / feature.
df_nufus['year'] = df_nufus['year'].astype(int)
df_nufus.head()

ilce,year,Adalar,Arnavutköy,Ataşehir,Avcılar,Bağcılar,Bahçelievler,Bakırköy,Başakşehir,Bayrampaşa,...,Sarıyer,Silivri,Sultanbeyli,Sultangazi,Şile,Şişli,Tuzla,Ümraniye,Üsküdar,Zeytinburnu
0,2008,14072,163510,351046,333944,720819,571683,214810,207542,268276,...,277372,124601,282026,444295,28571,312666,170453,553935,524889,288058
1,2009,14341,175871,361615,348635,724268,576799,218352,226387,269425,...,278527,134660,286622,452563,28325,316058,181658,573265,524379,290147
2,2010,14221,188011,375208,364682,738809,590063,219145,248467,269481,...,280802,138797,291063,468274,28119,317337,185819,603431,526947,292430
3,2011,13883,198230,387502,383736,746650,600900,220663,284488,269709,...,287309,144781,298143,483225,28847,320763,197230,631603,532182,293228
4,2012,14552,206299,395758,395274,749024,600162,221336,316176,269774,...,289959,150183,302388,492212,30218,318217,197657,645238,535916,292407


In [80]:
# Restrict the table to the year and the single district modelled below.
selected_columns = ['year', 'Ataşehir']
dataset = df_nufus[selected_columns]
dataset.head()

ilce,year,Ataşehir
0,2008,351046
1,2009,361615
2,2010,375208
3,2011,387502
4,2012,395758


## Visualizations

In [81]:
# Line chart of the Ataşehir population per year.
fig = px.line(
    dataset,
    x='year',
    y='Ataşehir',
    title='Population Of Ataşehir by Years',
)
fig.show()

In [82]:
# Line chart of the Küçükçekmece population per year, read from the full table.
fig = px.line(
    df_nufus,
    x='year',
    y='Küçükçekmece',
    title='Population Of Küçükçekmece by Years',
)
fig.show()

# Model Implementation

## Linear Regression

### Data Split

In [83]:
# Feature matrix: everything except the target column (only 'year' remains), kept 2-D.
# `columns=` states the intent explicitly; the previous `axis=True` only worked
# because True compares equal to 1.
X = dataset.drop(columns=['Ataşehir'])  # independent variable
# Target vector: the Ataşehir population.
y = dataset['Ataşehir']  # dependent variable

In [84]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [85]:
# Shapes of the four split parts, in the order (X_train, X_test, y_train, y_test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((9, 1), (5, 1), (9,), (5,))

In [86]:
# Scatter of the training observations only, ordered by their original row position.
train_years = X_train['year'].sort_index()
train_population = y_train.sort_index()

fig = go.Figure()

fig.add_trace(go.Scatter(x=train_years, y=train_population,
                         mode='markers', name='train data', marker=dict(color='red', size=5)))

# Label the figure so it can be read on its own.
fig.update_layout(title='Ataşehir Population (Train Set)', xaxis_title='Year', yaxis_title='Population')

fig.show()

### Model Training

In [87]:
from sklearn.linear_model import LinearRegression

# Least-squares fit of population against year, using the training rows only.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [88]:
# Fitted values of the linear model for the training rows (in X_train's shuffled row order).
regressor.predict(X_train)

array([412210.34767025, 380407.63799283, 375107.18637993, 438712.60573477,
       391008.54121864, 406909.89605735, 422811.25089606, 385708.08960574,
       401609.44444444])

### Model Prediction

In [89]:
# Predictions for the held-out rows, rounded to one decimal place for display.
y_pred = regressor.predict(X_test).round(1)

In [90]:
# Display the rounded test-set predictions.
y_pred

array([417510.8, 428111.7, 369806.7, 433412.2, 396309. ])

In [91]:
# Side-by-side table of the held-out actual values and the rounded predictions.
actual_vs_predicted = np.column_stack((y_test, y_pred))
hesaplama = pd.DataFrame(
    actual_vs_predicted,
    columns=["Original Population", "Predicted Population"],
)
hesaplama

Unnamed: 0,Original Population,Predicted Population
0,423372,417510.8
1,425094,428111.7
2,351046,369806.7
3,422594,433412.2
4,405974,396309.0


In [92]:
# Training rows with their observed and fitted population, ordered by original row label.
# The fitted values are given X_train's index so all three pieces align by label.
train_fitted = pd.Series(
    regressor.predict(X_train),
    index=X_train.index,
    name='Predicted_Population',
)
df_train = (
    pd.concat([X_train, y_train.rename('Original_Population'), train_fitted], axis=1)
    .rename_axis(index='index')
    .sort_index()
)
df_train

Unnamed: 0_level_0,year,Original_Population,Predicted_Population
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2009,361615,375107.18638
2,2010,375208,380407.637993
3,2011,387502,385708.089606
4,2012,395758,391008.541219
6,2014,408986,401609.444444
7,2015,419368,406909.896057
8,2016,422513,412210.34767
10,2018,416318,422811.250896
13,2021,427217,438712.605735


In [17]:
# Held-out rows with their observed and predicted population, ordered by original row label.
# The predictions are given X_test's index so all three pieces align by label.
test_predicted = pd.Series(
    regressor.predict(X_test),
    index=X_test.index,
    name='Predicted_Population',
)
df_test = (
    pd.concat([X_test, y_test.rename('Original_Population'), test_predicted], axis=1)
    .rename_axis(index='index')
    .sort_index()
)
df_test

Unnamed: 0_level_0,year,Original_Population,Predicted_Population
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2008,351046,369806.734767
5,2013,405974,396308.992832
9,2017,423372,417510.799283
11,2019,425094,428111.702509
12,2020,422594,433412.154122


### Model Visualizations

In [18]:
# `go` is already imported in the notebook's import cell, so it is not re-imported here.
fig = go.Figure()

# Observed training points. The legend entry previously read 'Test Data',
# although this figure plots df_train.
fig.add_trace(go.Scatter(x=df_train['year'], y=df_train['Original_Population'],
                         mode='markers', name='Train Data', marker=dict(color='red')))

# Fitted values of the linear model for the same training years.
fig.add_trace(go.Scatter(x=df_train['year'], y=df_train['Predicted_Population'],
                         mode='lines', name='Regression Line', line=dict(color='blue')))

fig.update_layout(title='Population vs Year (Train Set)', xaxis_title='Year', yaxis_title='Population')


fig.show()

In [19]:
import plotly.express as px

# Scatter of every year in `dataset` with plotly's built-in OLS trendline overlaid.
fig = px.scatter(
    dataset,
    x="year",
    y="Ataşehir",
    trendline="ols",
)
fig.show()

In [20]:
# `px` is already imported in the notebook's import cell, so it is not re-imported here.
# Scatter of every year in `dataset` with an exponentially weighted moving-average
# trendline (halflife of 2 observations).
# The title previously said "(Train Set)", but the plot uses the full dataset.
fig = px.scatter(
    dataset,
    x="year",
    y="Ataşehir",
    trendline="ewm",
    trendline_options=dict(halflife=2),
    title="Population vs Year (EWM Trendline, Full Dataset)",
)
fig.show()

In [21]:
import plotly.graph_objs as go

# Observed populations of the held-out years, drawn as markers.
observed_trace = go.Scatter(
    x=df_test['year'],
    y=df_test['Original_Population'],
    mode='markers',
    name='Test Data',
    marker=dict(color='red'),
)

# Linear-model predictions for the same years, drawn as a line.
predicted_trace = go.Scatter(
    x=df_test['year'],
    y=df_test['Predicted_Population'],
    mode='lines',
    name='Regression Line',
    line=dict(color='blue'),
)

fig = go.Figure(data=[observed_trace, predicted_trace])
fig.update_layout(title='Population vs Year (Test Set)', xaxis_title='Year')
fig.show()

### Comparison Linear Regression with R2 Score

In [22]:
# Recompute the test-set predictions without rounding (replaces the rounded y_pred from earlier).
y_pred = regressor.predict(X_test)

In [23]:
# NOTE: scored on the full X / y (training and held-out rows together), not on the test split alone.
regressor.score(X, y)
# R^2 measures the explanatory power of the model: it tells us what fraction of the variation in the dependent variable is explained by the independent variables we use.

0.8381266298733376

In [24]:
# First ten observed populations (by position), used in the comparison table.
original_y = y.iloc[:10]

In [25]:
# Linear-model predictions for the same first ten rows, wrapped in a one-column frame.
first_ten_predictions = regressor.predict(X)[:10]
predicted_y = pd.DataFrame(first_ten_predictions)

In [26]:
# Observed vs. predicted population for the first ten rows, plus the residual.
comparison = pd.concat([original_y, predicted_y], axis="columns")
comparison.columns = ["original_y", "predicted_y"]
# error = observed - predicted (negative means the model over-predicts).
comparison = comparison.assign(error=comparison["original_y"] - comparison["predicted_y"])
comparison

Unnamed: 0,original_y,predicted_y,error
0,351046,369806.734767,-18760.734767
1,361615,375107.18638,-13492.18638
2,375208,380407.637993,-5199.637993
3,387502,385708.089606,1793.910394
4,395758,391008.541219,4749.458781
5,405974,396308.992832,9665.007168
6,408986,401609.444444,7376.555556
7,419368,406909.896057,12458.103943
8,422513,412210.34767,10302.65233
9,423372,417510.799283,5861.200717


Mean Squared Error

In [27]:
# Mean squared error of the linear model over every row of X / y.
all_rows_predicted = regressor.predict(X)
MSE = mean_squared_error(y, all_rows_predicted)
MSE

95056773.1697885

Root Mean Square Error 

In [28]:
# Square root of the MSE above, which puts the error back in the target's own units.
RMSE = np.sqrt(MSE) # Root Mean Square Error
RMSE

9749.706311976197

## Polynomial Linear Regression

### Data Split

In [167]:
# Rebuild the feature matrix and target for the polynomial section.
# `columns=` states the intent explicitly; the previous `axis=True` only worked
# because True compares equal to 1.
X = dataset.drop(columns=['Ataşehir'])
y = dataset['Ataşehir']

In [168]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows here (the linear section above used 30%); same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [169]:
from sklearn.linear_model import LinearRegression

# Plain linear fit on this section's split, used as the baseline for the polynomial model.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [170]:
# LinearRegression.score returns the coefficient of determination (R^2), not an accuracy,
# so the printed labels say R^2.
linear_train_r2 = lin_reg.score(X_train, y_train)
linear_test_r2 = lin_reg.score(X_test, y_test)
print("Training R^2 : ", linear_train_r2)
print("Testing R^2 : ", linear_test_r2)

Training acc :  0.8046055198886657
Testing acc :  0.8540847672881884


### Model Training

In [171]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single 'year' feature into polynomial terms up to degree 4.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
# The held-out rows go through the transformer fitted on the training rows.
X_poly_test = poly_reg.transform(X_test)
# Ordinary linear regression on the expanded features.
lin_reg_2 = LinearRegression().fit(X_poly, y_train)

### Model Visualizations

In [173]:
# Replace the shuffled original row labels with 0..n-1 so the target lines up
# positionally with the rows of X_poly when they are concatenated side by side.
# reset_index(drop=True) keeps y_train a Series named 'Ataşehir' and can be re-run safely;
# the previous to_frame()/reset_index()/drop() sequence raised on a second execution
# because a DataFrame has no to_frame().
y_train = y_train.reset_index(drop=True)

In [174]:
# Training table for the polynomial model: expanded features, observed target, fitted values.
# Predict from the already-computed X_poly rather than re-fitting the feature
# transformer with poly_reg.fit_transform(X_train) just to make predictions.
poly_train_fitted = pd.DataFrame(lin_reg_2.predict(X_poly), columns=['Predicted_Population'])
df_poly_train = (
    pd.concat([pd.DataFrame(X_poly), y_train, poly_train_fitted], axis=1)
    # Column 1 of the expansion is the untransformed year (column 0 is the constant term).
    .rename(columns={1: 'year', 'Ataşehir': 'Original_Population'})
    .sort_values(by=['year'])
)
df_poly_train

Unnamed: 0,0,year,2,3,4,Original_Population,Predicted_Population
4,1.0,2009.0,4036081.0,8108487000.0,16289950000000.0,361615,360286.628052
3,1.0,2010.0,4040100.0,8120601000.0,16322410000000.0,375208,375699.92511
9,1.0,2011.0,4044121.0,8132727000.0,16354910000000.0,387502,388166.832489
6,1.0,2012.0,4048144.0,8144866000.0,16387470000000.0,395758,398025.33046
1,1.0,2013.0,4052169.0,8157016000.0,16420070000000.0,405974,405613.907318
10,1.0,2014.0,4056196.0,8169179000.0,16452730000000.0,408986,411271.56015
7,1.0,2015.0,4060225.0,8181353000.0,16485430000000.0,419368,415337.793991
2,1.0,2016.0,4064256.0,8193540000.0,16518180000000.0,422513,418152.623108
8,1.0,2018.0,4072324.0,8217950000.0,16583820000000.0,416318,421390.664337
0,1.0,2020.0,4080400.0,8242408000.0,16649660000000.0,422594,423715.963425


In [175]:
# `go` is already imported in the notebook's import cell, so it is not re-imported here.
fig = go.Figure()

# Observed training points. The legend entry previously read 'Test Data',
# although this figure plots df_poly_train.
fig.add_trace(go.Scatter(x=df_poly_train['year'], y=df_poly_train['Original_Population'],
                         mode='markers', name='Train Data', marker=dict(color='red')))

# Fitted values of the degree-4 polynomial model for the same training years.
fig.add_trace(go.Scatter(x=df_poly_train['year'], y=df_poly_train['Predicted_Population'],
                         mode='lines', name='Regression Line', line=dict(color='blue')))

fig.update_layout(title='Population vs Year (Train Set)', xaxis_title='Year', yaxis_title='Population')


fig.show()

### Accuracy Of Polynomial Regression

In [172]:
# LinearRegression.score returns the coefficient of determination (R^2), not an accuracy,
# so the printed labels say R^2.
poly_train_r2 = lin_reg_2.score(X_poly, y_train)
poly_test_r2 = lin_reg_2.score(X_poly_test, y_test)
print("Training R^2 : ", poly_train_r2)
print("Testing R^2 : ", poly_test_r2)

Training acc :  0.9829661226929639
Testing acc :  0.9700016499149674
