In [3]:
import numpy as np 
import pandas as pd 
import plotly.express as px

# Understanding the DataSet 🚨

In [4]:
data = pd.read_csv('all_countries.csv')

In [5]:
data.head()

Unnamed: 0,Country,Region,Population,Area (sq. mi.),Pop. Density (per sq. mi.),Coastline (coast/area ratio),Net migration,Infant mortality (per 1000 births),GDP ($ per capita),Literacy (%),Phones (per 1000),Arable (%),Crops (%),Other (%),Climate,Birthrate,Deathrate,Agriculture,Industry,Service
0,Afghanistan,ASIA (EX. NEAR EAST),31056997,647500,480,0,2306,16307,700.0,360,32,1213,22,8765,1,466,2034,38.0,24.0,38.0
1,Albania,EASTERN EUROPE,3581655,28748,1246,126,-493,2152,4500.0,865,712,2109,442,7449,3,1511,522,232.0,188.0,579.0
2,Algeria,NORTHERN AFRICA,32930091,2381740,138,4,-39,31,6000.0,700,781,322,25,9653,1,1714,461,101.0,6.0,298.0
3,American Samoa,OCEANIA,57794,199,2904,5829,-2071,927,8000.0,970,2595,10,15,75,2,2246,327,,,
4,Andorra,WESTERN EUROPE,71201,468,1521,0,66,405,19000.0,1000,4972,222,0,9778,3,871,625,,,


In [6]:
data.shape

(227, 20)

> We came to know that this dataset has 227 rows and 20 columns.
> Now we need more information about in which datatype they are stored

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 20 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country                             227 non-null    object 
 1   Region                              227 non-null    object 
 2   Population                          227 non-null    int64  
 3   Area (sq. mi.)                      227 non-null    int64  
 4   Pop. Density (per sq. mi.)          227 non-null    object 
 5   Coastline (coast/area ratio)        227 non-null    object 
 6   Net migration                       224 non-null    object 
 7   Infant mortality (per 1000 births)  224 non-null    object 
 8   GDP ($ per capita)                  226 non-null    float64
 9   Literacy (%)                        209 non-null    object 
 10  Phones (per 1000)                   223 non-null    object 
 11  Arable (%)                          225 non-n

### Observations:
1. Country and Region are the only 2 columns which are non-numerical
2. In those numerical columns, GDP ($ per capita) is the only one which has float values
3. Population and Area are integers
4. Remaining all are in object type, which doesn't help us in this format
5. We will convert all object type columns into float
6. We need to shorten the column names also, which is seems longgggg

> Changing the names of the columns

In [8]:
data.columns=(["country","region","population","area","density","coastline_area_ratio","net_migration","infant_mortality","gdp_per_capita",
                  "literacy","phones","arable","crops","other","climate","birthrate","deathrate","agriculture","industry",
                  "service"])

In [9]:
print(data.columns)

Index(['country', 'region', 'population', 'area', 'density',
       'coastline_area_ratio', 'net_migration', 'infant_mortality',
       'gdp_per_capita', 'literacy', 'phones', 'arable', 'crops', 'other',
       'climate', 'birthrate', 'deathrate', 'agriculture', 'industry',
       'service'],
      dtype='object')


### Fixing datatypes
- converting into float/string types

In [10]:
data.country = data.country.astype('category')
data.region = data.region.astype('category')

> We will first convert obj -> str then str -> float

In [11]:
data.density = data.density.astype(str)
data.density = data.density.str.replace(',','.').astype(float)

data.coastline_area_ratio = data.coastline_area_ratio.astype(str)
data.coastline_area_ratio = data.coastline_area_ratio.str.replace(',','.').astype(float)

data.net_migration = data.net_migration.astype(str)
data.net_migration = data.net_migration.str.replace(',','.').astype(float)

data.infant_mortality = data.infant_mortality.astype(str)
data.infant_mortality = data.infant_mortality.str.replace(',','.').astype(float)

data.literacy = data.literacy.astype(str)
data.literacy = data.literacy.str.replace(',','.').astype(float)

data.phones = data.phones.astype(str)
data.phones = data.phones.str.replace(',','.').astype(float)

data.service = data.service.astype(str)
data.service = data.service.str.replace(',','.').astype(float)

data.industry = data.industry.astype(str)
data.industry = data.industry.str.replace(',','.').astype(float)

data.agriculture = data.agriculture.astype(str)
data.agriculture = data.agriculture.str.replace(',','.').astype(float)

data.deathrate = data.deathrate.astype(str)
data.deathrate = data.deathrate.str.replace(',','.').astype(float)

data.birthrate = data.birthrate.astype(str)
data.birthrate = data.birthrate.str.replace(',','.').astype(float)

data.climate = data.climate.astype(str)
data.climate = data.climate.str.replace(',','.').astype(float)

data.arable = data.arable.astype(str)
data.arable = data.arable.str.replace(',','.').astype(float)

data.crops = data.crops.astype(str)
data.crops = data.crops.str.replace(',','.').astype(float)

data.other = data.other.astype(str)
data.other = data.other.str.replace(',','.').astype(float)

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   country               227 non-null    category
 1   region                227 non-null    category
 2   population            227 non-null    int64   
 3   area                  227 non-null    int64   
 4   density               227 non-null    float64 
 5   coastline_area_ratio  227 non-null    float64 
 6   net_migration         224 non-null    float64 
 7   infant_mortality      224 non-null    float64 
 8   gdp_per_capita        226 non-null    float64 
 9   literacy              209 non-null    float64 
 10  phones                223 non-null    float64 
 11  arable                225 non-null    float64 
 12  crops                 225 non-null    float64 
 13  other                 225 non-null    float64 
 14  climate               205 non-null    float64 
 15  birthr

### Now all seems okay to proceed for statistical analysis

In [13]:
data.describe()

Unnamed: 0,population,area,density,coastline_area_ratio,net_migration,infant_mortality,gdp_per_capita,literacy,phones,arable,crops,other,climate,birthrate,deathrate,agriculture,industry,service
count,227.0,227.0,227.0,227.0,224.0,224.0,226.0,209.0,223.0,225.0,225.0,225.0,205.0,224.0,223.0,212.0,211.0,212.0
mean,28740280.0,598227.0,379.047137,21.16533,0.038125,35.506964,9689.823009,82.838278,236.061435,13.797111,4.564222,81.638311,2.139024,22.114732,9.241345,0.150844,0.282711,0.565283
std,117891300.0,1790282.0,1660.185825,72.286863,4.889269,35.389899,10049.138513,19.722173,227.991829,13.040402,8.36147,16.140835,0.699397,11.176716,4.990026,0.146798,0.138272,0.165841
min,7026.0,2.0,0.0,0.0,-20.99,2.29,500.0,17.6,0.2,0.0,0.0,33.33,1.0,7.29,2.29,0.0,0.02,0.062
25%,437624.0,4647.5,29.15,0.1,-0.9275,8.15,1900.0,70.6,37.8,3.22,0.19,71.65,2.0,12.6725,5.91,0.03775,0.193,0.42925
50%,4786994.0,86600.0,78.8,0.73,0.0,21.0,5550.0,92.5,176.2,10.42,1.03,85.7,2.0,18.79,7.84,0.099,0.272,0.571
75%,17497770.0,441811.0,190.15,10.345,0.9975,55.705,15700.0,98.0,389.65,20.0,4.44,95.44,3.0,29.82,10.605,0.221,0.341,0.6785
max,1313974000.0,17075200.0,16271.5,870.66,23.06,191.19,55100.0,100.0,1035.6,62.11,50.68,100.0,4.0,50.73,29.74,0.769,0.906,0.954


In [14]:

data.isnull().sum()

country                  0
region                   0
population               0
area                     0
density                  0
coastline_area_ratio     0
net_migration            3
infant_mortality         3
gdp_per_capita           1
literacy                18
phones                   4
arable                   2
crops                    2
other                    2
climate                 22
birthrate                3
deathrate                4
agriculture             15
industry                16
service                 15
dtype: int64

In [15]:
fig = px.bar(data.columns,x=data.columns,y=data.isnull().sum(),title="Number of missing values",text_auto='.2s')
fig.update_traces(textfont_size=12,textangle=0,textposition="outside",cliponaxis=False)
fig.show()

In [16]:

fig = px.imshow(data.isnull())
fig.show()

### Observations:
- Climate column has maximum missing values
- Initial 6 columns have no missing values!

# Checking Data Validity ✅
We will randomly select some attributes to compare on internet to check whether are they believable or not

- (p) => population
- (a) => area
- (c) => coastline/Area
- (g) => GDP Source information

- > https://www.jetpunk.com/info/countries-by-coastline
- > https://en.wikipedia.org/

Internet data:
1. Afghanistan
- (p) = ~ 40M
- (a) = ~ 6L /km2
- (c) => 0
- (g) => ~ 1400 crores USD

2. France
- (p) = ~ 66M
- (a) = ~ 5L /km2
- (c) => 7.58m/km2
- (g) => ~ 2.96 lakh crore USD

3. Spain
- (p) = ~ 4.7 crores
- (a) = ~ 5L /km2   
- (c) => 7.03m/km2
- (g) => ~ 1.43 lakh crore USD

In [17]:
data.loc[[0,69,190],['country','population','area','coastline_area_ratio','gdp_per_capita']]

Unnamed: 0,country,population,area,coastline_area_ratio,gdp_per_capita
0,Afghanistan,31056997,647500,0.0,700.0
69,France,60876136,547030,0.63,27600.0
190,Spain,40397842,504782,0.98,22000.0


### Observation:
- Almost looks near to actual values with the data we have
- altough there are slight dissimilarities quite a few, but not much to ruin our analysis
-  data points are seems OLD, hence not updated to this date such as population,areas etc

# Identifying undefined features 🔍
 
### what does the attributes such as climate,aggriculure, industry and service refer to?

In [18]:
data.loc[:,['country','region','climate','agriculture','industry','service']].tail()

Unnamed: 0,country,region,climate,agriculture,industry,service
222,West Bank,NEAR EAST,3.0,0.09,0.28,0.63
223,Western Sahara,NORTHERN AFRICA,1.0,,,0.4
224,Yemen,NEAR EAST,1.0,0.135,0.472,0.393
225,Zambia,SUB-SAHARAN AFRICA,2.0,0.22,0.29,0.489
226,Zimbabwe,SUB-SAHARAN AFRICA,2.0,0.179,0.243,0.579


### Observations:
- If we add agriculture,industry and service, the sum becomes 1
- Which means they are percentage(%) values (9+28+63 = 100)

> understaning climate column

In [19]:
sorted(data.climate.unique())

[1.0, 1.5, 2.0, 2.5, 3.0, nan, 4.0]

In [20]:
one=data.loc[:, ['country', 'region', 'climate']][data.climate == 1].head()
two=data.loc[:, ['country', 'region', 'climate']][data.climate == 1.5].head()
three=data.loc[:, ['country', 'region', 'climate']][data.climate == 2].head()
four=data.loc[:, ['country', 'region', 'climate']][data.climate == 2.5].head()
five=data.loc[:, ['country', 'region', 'climate']][data.climate == 3].head()
six=data.loc[:, ['country', 'region', 'climate']][data.climate == 4].head()

In [21]:
pd.concat([one,two,three,four,five,six])

Unnamed: 0,country,region,climate
0,Afghanistan,ASIA (EX. NEAR EAST),1.0
2,Algeria,NORTHERN AFRICA,1.0
11,Australia,OCEANIA,1.0
13,Azerbaijan,C.W. OF IND. STATES,1.0
15,Bahrain,NEAR EAST,1.0
24,Bolivia,LATIN AMER. & CARIB,1.5
35,Cameroon,SUB-SAHARAN AFRICA,1.5
42,China,ASIA (EX. NEAR EAST),1.5
63,Eritrea,SUB-SAHARAN AFRICA,1.5
107,Kenya,SUB-SAHARAN AFRICA,1.5


### Observations:
1. Climate has 6 unique values along with  missing ones too
2. we can assume countries with climate value as 1 are having more desert in them (Afghanistan,Australia...)
3. Tropical => climate 2
4. cold/cool => cilmate 3
5. hot + tropical => climate 1.5
6. tropical + cold => climate 2.5
7. Countries having climate 4 are also can be added to cold/cool, but it is nowhere mentioned in the dataset, we will deal with this later
8. 226 - 194 = 22
means there are 22 null values present, so these must be replaced by 0 or something new value

# Data Cleaning 🧹

In [22]:
data.isnull().sum()

country                  0
region                   0
population               0
area                     0
density                  0
coastline_area_ratio     0
net_migration            3
infant_mortality         3
gdp_per_capita           1
literacy                18
phones                   4
arable                   2
crops                    2
other                    2
climate                 22
birthrate                3
deathrate                4
agriculture             15
industry                16
service                 15
dtype: int64

1. net_migration, infant_mortality only 3 values missing, which  belong to small nations so we can  fill as 0
2. West Sahara country's gdp_per_capita is missing, as per the internet it is showingn $2500, we will replace by this value
3. literacy has 18, we fill them by its mean value
4. phone will be replaced by mean of it
5. arable,crops and other have 4,2,2 missing respectively, each will be replaced by 0
6. Climate has maximum number of missing  values (22), will be replaced by 0 (unknown climate)
7. birthrate and deathrate are calucated per 1000, not population based. so these can be updated by mean value only
8. Agriculture, service and industry have 15-16 missing values.
All belong to smaller nations which are heavily dependant upon service and less on agriculture and industry,So
- agriculture => 0.15
- industry => 0.05
- service => 0.8

In [23]:
data['net_migration'].fillna(0,inplace=True)
data['infant_mortality'].fillna(0,inplace=True)
data['arable'].fillna(0,inplace=True)
data['crops'].fillna(0,inplace=True)
data['other'].fillna(0,inplace=True)
data['climate'].fillna(0,inplace=True)

In [24]:
data['gdp_per_capita'].fillna(2500,inplace=True)
data['literacy'].fillna(data.groupby('region')['literacy'].transform('mean'),inplace=True)
data['phones'].fillna(data.groupby('region')['phones'].transform('mean'),inplace=True)
data['birthrate'].fillna(data.groupby('region')['birthrate'].transform('mean'),inplace=True)
data['deathrate'].fillna(data.groupby('region')['deathrate'].transform('mean'),inplace=True)

In [25]:
data['agriculture'].fillna(0.17,inplace=True)
data['service'].fillna(0.8,inplace=True)
data['industry'].fillna((1-data['agriculture'] - data['service']),inplace=True)

In [26]:
data.isnull().sum()

country                 0
region                  0
population              0
area                    0
density                 0
coastline_area_ratio    0
net_migration           0
infant_mortality        0
gdp_per_capita          0
literacy                0
phones                  0
arable                  0
crops                   0
other                   0
climate                 0
birthrate               0
deathrate               0
agriculture             0
industry                0
service                 0
dtype: int64

# Exploratory Data Analysis (EDA) 🔭

> Understanding the correlation

In [72]:

fig = px.imshow(data.corr(),text_auto=True, aspect="auto")
fig.show()

### Observations:
- Strong correlations are,
    1. infant_mortality & birthrate
    2. infant_mortality & literacy
    3. gdp_per_capita & phones
    4. arable & other than crops
    5. birthrate & literacy (less literacy = higher the birthrate)
- Weak correlations are,
    1. infant_mortality & agriculture
    2. birthrate & phones
    3. gdp_per_capita & birthrate

### We are going in depth with more features simultaneously side by side

In [28]:

fig = px.scatter_matrix(data, dimensions=data[['population', 'area', 'net_migration', 'gdp_per_capita', 'climate']],width=700, height=720,title="Features relationships",color="gdp_per_capita")
fig.show()

### Observations:
- net_migration & gdp_per_capita has good correlations, which means migrants always prefers to move to the countries having better economy and growth which is gdp in our case.
- climate and populations are less correlated, means people avoid extreme weather and climate places
- as area increased the amount of migratants also increased, obvious.

# Regional Analysis on multiple features 🔬

In [77]:
# for visualization
data.to_csv('filtered_data.csv', index=False)

In [29]:

fig = px.box(data,x="area",y="gdp_per_capita",points="all")
fig.show()

As the area increased, gdp did not kept up.

In [73]:

fig = px.bar(data, x='region', y='country')
fig.show()

Sub-Saharan Africa region has the most countries in it.

In [31]:
fig = px.bar(data, x='region', y='gdp_per_capita',color='country',title="GDP of multiple Regions",width=700,height=500)
fig.show()

Western Europe and Latin Amer. & Carib has highest GDP respectively, where as North Africa nad Balitics have least.

In [32]:

fig = px.scatter_3d(data, x='region', y='gdp_per_capita', z='net_migration',color='region',size="gdp_per_capita",height=700)
fig.show()

Migrants are dense towards Asia,North America and North Europe.

In [33]:
fig = px.scatter_3d(data, x='region', y='phones', z='literacy',
              color='region', size='population', size_max=18,
              symbol='region', opacity=0.7)
fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))

As literacy increased there is many points of phones, and asia has highest population also which makes belive us that chance of having phones are high here.

In [34]:
fig = px.bar(data, x="area", y="region", color="region", title="Area of each region",width=1000,height=500,orientation='h')
fig.show()

As expected Asia and Sub-saharan africa have highest area

# Extensive GDP Analysis ⚒️

Let's see the regional ranking according to the average gdp_per_capita.

In [35]:
d=data.groupby('region')['gdp_per_capita'].mean().sort_values()

fig = px.bar(d, x='gdp_per_capita',orientation='h',width=700,height=500)
fig.show()

### Observations:
- North America and Western Europe have highest average of gdp_per_capita
- Sub-Saharan Africa and C.W of Ind States have least; Which means large migration happened in the last decade.

In [36]:

fig = px.scatter(data, x="literacy", y="gdp_per_capita",title='GDP Analysis v/s Literacy', size='literacy', color="region",
           hover_name="country", log_x=True, size_max=60)
fig.show()

From this observation it became crystal clear that GDP of a country is highly dependant upon literacy and vice versa.

In [37]:
fig = px.scatter(data, x="arable", y="gdp_per_capita",color='area', title='GDP v/s Arable land analysis',marginal_x="histogram", marginal_y="rug")
fig.show()

We don't see strong relation between GDP and Arable land. So agriculture is not a strong factor anymore for economy of a country according to this graph.

In [38]:

fig = px.line(data, x='infant_mortality', y='gdp_per_capita',color='region',width=700,height=700,title='GDP v/s Infant Mortality Rate',symbol='region')
fig.show()

From the above graph we can observe that poor countries are suffering heavy loss of infants.

In [39]:
fig = px.scatter(data, x="agriculture", y="gdp_per_capita", color="region",
                 title='GDP v/s Agriculture (Crops)')
fig.show()

From the above it is shown as poor countries are more dependant upon harvesting crops than developed countris.

In [40]:
fig = px.scatter(data, x="industry", y="gdp_per_capita",title='GDP Analysis v/s Industry', size='literacy', color="region",
           hover_name="country", log_x=True, size_max=60)
fig.show()

We can not industry as our parameter because it is evenly distributed across all the countries

# Data Pre-Conditioning ⚖️

Preparing training data to feed the machine learning model.

We are going to perform the following:
1. Convert `region` column into numerical values.
2. Splitting the dataset into `train` and `test` in the ratio of 80 : 20
3. We will drop the column `countries` because it is in string.
4. We are using `gdp_per_capita` as labels.
5. We repeat the train test split with different ratios for better splitting with/without feature selection/scaling.

### Transforming region column

In [41]:
data_new = pd.concat([data,pd.get_dummies(data['region'], prefix='region')],axis=1).drop(['region'],axis=1)
print(data_new.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 30 columns):
 #   Column                                      Non-Null Count  Dtype   
---  ------                                      --------------  -----   
 0   country                                     227 non-null    category
 1   population                                  227 non-null    int64   
 2   area                                        227 non-null    int64   
 3   density                                     227 non-null    float64 
 4   coastline_area_ratio                        227 non-null    float64 
 5   net_migration                               227 non-null    float64 
 6   infant_mortality                            227 non-null    float64 
 7   gdp_per_capita                              227 non-null    float64 
 8   literacy                                    227 non-null    float64 
 9   phones                                      227 non-null    float64 
 10  ar

In [42]:
data_new.head()

Unnamed: 0,country,population,area,density,coastline_area_ratio,net_migration,infant_mortality,gdp_per_capita,literacy,phones,...,region_BALTICS,region_C.W. OF IND. STATES,region_EASTERN EUROPE,region_LATIN AMER. & CARIB,region_NEAR EAST,region_NORTHERN AFRICA,region_NORTHERN AMERICA,region_OCEANIA,region_SUB-SAHARAN AFRICA,region_WESTERN EUROPE
0,Afghanistan,31056997,647500,48.0,0.0,23.06,163.07,700.0,36.0,3.2,...,0,0,0,0,0,0,0,0,0,0
1,Albania,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,86.5,71.2,...,0,0,1,0,0,0,0,0,0,0
2,Algeria,32930091,2381740,13.8,0.04,-0.39,31.0,6000.0,70.0,78.1,...,0,0,0,0,0,1,0,0,0,0
3,American Samoa,57794,199,290.4,58.29,-20.71,9.27,8000.0,97.0,259.5,...,0,0,0,0,0,0,0,1,0,0
4,Andorra,71201,468,152.1,0.0,6.6,4.05,19000.0,100.0,497.2,...,0,0,0,0,0,0,0,0,0,1


### Data Split 1: full data without Scaling

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [44]:
y=data_new['gdp_per_capita']
X=data_new.drop(['gdp_per_capita','country'],axis=1)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=3)


### Data Split 2: full data with Scaling

In [45]:
sc=StandardScaler()

X2_train = sc.fit_transform(X_train)
X2_test = sc.fit_transform(X_test)
y2_train = y_train
y2_test = y_test

### Data Split 3: selected features data without Scaling

> We select only some proportion of our features which have `correlation` score near to `+/- 0.3` with `gdp_per_capita`.

In [46]:
y3 = y
X3 = data_new.drop(['gdp_per_capita','country','population', 'area', 'coastline_area_ratio', 'arable',
                      'crops', 'other', 'climate', 'deathrate', 'industry'], axis=1)

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=101)

In [47]:
X3.tail()


Unnamed: 0,density,net_migration,infant_mortality,literacy,phones,birthrate,agriculture,service,region_ASIA (EX. NEAR EAST),region_BALTICS,region_C.W. OF IND. STATES,region_EASTERN EUROPE,region_LATIN AMER. & CARIB,region_NEAR EAST,region_NORTHERN AFRICA,region_NORTHERN AMERICA,region_OCEANIA,region_SUB-SAHARAN AFRICA,region_WESTERN EUROPE
222,419.9,2.98,19.62,79.521429,145.2,31.67,0.09,0.63,0,0,0,0,0,1,0,0,0,0,0
223,1.0,0.0,0.0,67.24,100.2,20.814,0.17,0.4,0,0,0,0,0,0,1,0,0,0,0
224,40.6,0.0,61.5,50.2,37.2,42.89,0.135,0.393,0,0,0,0,0,1,0,0,0,0,0
225,15.3,0.0,88.29,80.6,8.2,41.0,0.22,0.489,0,0,0,0,0,0,0,0,0,1,0
226,31.3,0.0,67.69,90.7,26.8,28.01,0.179,0.579,0,0,0,0,0,0,0,0,0,1,0


### Data Split 4: selected features data with Scaling

In [48]:
X4_train = sc.fit_transform(X3_train)
X4_test = sc.fit_transform(X3_test)
y4_train = y3_train
y4_test = y3_test

# Linear Regression 📈

This is our first trial towards achieving better results with the supervised machine learning algorthms. As we observed before, some of the features in our dataset were not correlated at each other. Although we are going to test it.

Model training

In [49]:
from sklearn.linear_model import LinearRegression
linear_model_1 = LinearRegression()
linear_model_1.fit(X_train,y_train)

linear_model_2 = LinearRegression()
linear_model_2.fit(X2_train,y2_train)

linear_model_3 = LinearRegression()
linear_model_3.fit(X3_train,y3_train)

linear_model_4 = LinearRegression()
linear_model_4.fit(X4_train,y4_train)

LinearRegression()

The Predictions of all the instances

In [50]:
lm1_pred = linear_model_1.predict(X_test)
lm2_pred = linear_model_2.predict(X2_test)
lm3_pred = linear_model_3.predict(X3_test)
lm4_pred = linear_model_4.predict(X4_test)


Evaluation of the model's predictions

In [51]:
from sklearn import metrics

print('The Performance of the Linear Regression:')

print('\nAll features without Scaling:')
print('MeanAbsoluteError:', metrics.mean_absolute_error(y_test, lm1_pred))
print('MeanSquaredError:', np.sqrt(metrics.mean_squared_error(y_test, lm1_pred)))
print('R2_Score: ', metrics.r2_score(y_test, lm1_pred))

print('\nAll features with Scaling:')
print('MAE:', metrics.mean_absolute_error(y2_test, lm2_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2_test, lm2_pred)))
print('R2_Score: ', metrics.r2_score(y2_test, lm2_pred))

print('\nOnly Selected features without Scaling:')
print('MAE:', metrics.mean_absolute_error(y3_test, lm3_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y3_test, lm3_pred)))
print('R2_Score: ', metrics.r2_score(y3_test, lm3_pred))

print('\nOnly Selected features  with Scaling:')
print('MAE:', metrics.mean_absolute_error(y4_test, lm4_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y4_test, lm4_pred)))
print('R2_Score: ', metrics.r2_score(y4_test, lm4_pred))

The Performance of the Linear Regression:

All features without Scaling:
MeanAbsoluteError: 4763.442677612238
MeanSquaredError: 7171.981229973934
R2_Score:  0.6167149158061069

All features with Scaling:
MAE: 4480.652667974766
RMSE: 6772.891385376827
R2_Score:  0.6581844214533712

Only Selected features without Scaling:
MAE: 2965.9357229398765
RMSE: 4088.794580247946
R2_Score:  0.7976685756859001

Only Selected features  with Scaling:
MAE: 2879.5213243944404
RMSE: 3756.4365885029674
R2_Score:  0.8292247702712089


In [52]:

fig = px.scatter( x=y4_test, y=lm4_pred,title="Test vs LR's Predictions")
fig.show()

This is a decent result from the Linear Regression with feature selection and scaling

# SVM (Support Vector Machine) 🩼

Model Training

In [53]:
from sklearn.svm import SVR
svm1 = SVR(kernel='rbf')
svm1.fit(X_train,y_train)

svm2 = SVR(kernel='rbf')
svm2.fit(X2_train,y2_train)

svm3 = SVR(kernel='rbf')
svm3.fit(X3_train,y3_train)

svm4 = SVR(kernel='rbf')
svm4.fit(X4_train,y4_train)

SVR()

Predictions

In [54]:
svm1_pred = svm1.predict(X_test)
svm2_pred = svm2.predict(X2_test)
svm3_pred = svm3.predict(X3_test)
svm4_pred = svm4.predict(X4_test)

Evaluation

In [55]:
print('SVM Performance:')

print('\nall features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y_test, svm1_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, svm1_pred)))
print('R2_Score: ', metrics.r2_score(y_test, svm1_pred))

print('\nall features, with scaling:')
print('MAE:', metrics.mean_absolute_error(y2_test, svm2_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y2_test, svm2_pred)))
print('R2_Score: ', metrics.r2_score(y2_test, svm2_pred))

print('\nselected features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y3_test, svm3_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y3_test, svm3_pred)))
print('R2_Score: ', metrics.r2_score(y3_test, svm3_pred))

print('\nselected features, with scaling:')
print('MAE:', metrics.mean_absolute_error(y4_test, svm4_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y4_test, svm4_pred)))
print('R2_Score: ', metrics.r2_score(y4_test, svm4_pred))

SVM Performance:

all features, No scaling:
MAE: 7804.451328630339
RMSE: 12596.919588768587
R2_Score:  -0.18242157190001818

all features, with scaling:
MAE: 7798.575089689044
RMSE: 12585.784083260287
R2_Score:  -0.1803320067216596

selected features, No scaling:
MAE: 7047.711927073501
RMSE: 9807.997922107874
R2_Score:  -0.16421578810668724

selected features, with scaling:
MAE: 7040.043820847137
RMSE: 9794.58886537642
R2_Score:  -0.1610346364957338


In [56]:
fig = px.scatter( x=y3_test, y=svm3_pred,title='Test vs SVM Prediction')
fig.show()

Feature scaling and selection did not help much for the prediction in our case, hence results of SVM is worse than LR. 

# Random Forest 🌲

We will feed our standard data which has been splitted, and will not going to perform any scaling or selection of features because these are not going to improve prediction by Random Forest

In [57]:
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor(random_state=11, n_estimators=200)
rf3 = RandomForestRegressor(random_state=11, n_estimators=200)

rf1.fit(X_train, y_train)
rf3.fit(X3_train, y3_train)

RandomForestRegressor(n_estimators=200, random_state=11)

Prediction

In [58]:
rf1_pred = rf1.predict(X_test)
rf3_pred = rf3.predict(X3_test)

Evaluation

In [59]:
print('Random Forest Performance:')

print('\nall features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y_test, rf1_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rf1_pred)))
print('R2_Score: ', metrics.r2_score(y_test, rf1_pred))

print('\nselected features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y3_test, rf3_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y3_test, rf3_pred)))
print('R2_Score: ', metrics.r2_score(y3_test, rf3_pred))

Random Forest Performance:

all features, No scaling:
MAE: 3500.663043478261
RMSE: 6009.861228322635
R2_Score:  0.7308635509536017

selected features, No scaling:
MAE: 2451.8804347826085
RMSE: 3580.5349437892173
R2_Score:  0.8448439955336524


In [60]:
fig = px.scatter( x=y_test, y=rf1_pred,title='Test vs RF Prediction')
fig.show()

This result is good, but we will try to improve its performance by following grid search method to get understanding of good parameter values that can improvise the model.

The Parameters we are going to test are,
- `n-estimators`: The number of trees in the forest. Default 100 or 10 in new versions. 
- `min_sample_leaf`: The minimum number of samples required to be at leaf node.
- `max_features`: The number of features that we are looking for the Best split
- `bootstrap`: While building a tree we give bootstrap samples or whole dataset for each tree.

In [61]:
rf_param_grid = {'max_features': ['sqrt', 'log2','float','auto'],
              'min_samples_leaf': [1, 3, 5],
              'n_estimators': [100, 500, 1000],
             'bootstrap': [False, True]} 

In [62]:
from sklearn.model_selection import GridSearchCV
rf_grid = GridSearchCV(estimator= RandomForestRegressor(), param_grid = rf_param_grid,  n_jobs=-1, verbose=0)

In [63]:
rf_grid.fit(X_train,y_train)



90 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python310\lib\site-packages\sklearn\ensemble\_forest.py", line 450, in fit
    trees = Parallel(
  File "c:\Python310\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "c:\Python310\lib\site-packages\joblib\parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "c:\Python310\lib\site-packages\joblib\parallel.py", line 779, in _

GridSearchCV(estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [False, True],
                         'max_features': ['sqrt', 'log2', 'float', 'auto'],
                         'min_samples_leaf': [1, 3, 5],
                         'n_estimators': [100, 500, 1000]})

In [64]:
rf_grid.best_params_

{'bootstrap': True,
 'max_features': 'auto',
 'min_samples_leaf': 3,
 'n_estimators': 500}

In [65]:
print(rf_grid.best_estimator_)

RandomForestRegressor(min_samples_leaf=3, n_estimators=500)


In [66]:
rf_grid_predictions = rf_grid.predict(X_test)

In [67]:
print('MAE:', metrics.mean_absolute_error(y_test, rf_grid_predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rf_grid_predictions)))
print('R2_Score: ', metrics.r2_score(y_test, rf_grid_predictions))

MAE: 3459.207754671444
RMSE: 5905.843800081827
R2_Score:  0.7400992439021956


In [68]:
fig = px.scatter( x=y_test, y=rf_grid_predictions,title='Test vs RF GridSearched Prediction')
fig.show()

In [69]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y3_test, rf3_pred))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

We can't see a significant improvement over initial parameter. Probably initial values are only optimum.

## Saving the model using Pickle

In [None]:
import  pickle
pickle.dump(rf3,open('RFmodel.pkl','wb'))

# Gradient Boosting 🌈

Training with default parameters

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbm1 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_depth=3,
                                 subsample=1.0, max_features= None, random_state=101)
gbm3 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_depth=3,
                                 subsample=1.0, max_features= None, random_state=101)

gbm1.fit(X_train, y_train)
gbm3.fit(X3_train, y3_train)

GradientBoostingRegressor(random_state=101)

Prediction

In [None]:
gbm1_pred = gbm1.predict(X_test)
gbm3_pred = gbm3.predict(X3_test)

Evaluation

In [None]:
print('Gradiant Boosting Performance:')

print('\nall features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y_test, gbm1_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, gbm1_pred)))
print('R2_Score: ', metrics.r2_score(y_test, gbm1_pred))

print('\nselected features, No scaling:')
print('MAE:', metrics.mean_absolute_error(y3_test, gbm3_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y3_test, gbm3_pred)))
print('R2_Score: ', metrics.r2_score(y3_test, gbm3_pred))

Gradiant Boosting Performance:

all features, No scaling:
MAE: 3534.5813858981837
RMSE: 5928.211514861006
R2_Score:  0.7381268264285601

selected features, No scaling:
MAE: 2467.2081266874507
RMSE: 3789.2979753946875
R2_Score:  0.8262238105475073


In [None]:
fig = px.scatter( x=y_test, y=gbm1_pred,title='Test vs GBR Prediction')
fig.show()

Gradient Boosting gave us pretty good performance overall, that too without the need of optimisation!
> Although RandomForest and GradientBoosting are comparable with respect to our same dataset.

Knowing the importance across all the Features.

In [None]:
imp_features = pd.Series(gbm1.feature_importances_, list(X_train)).sort_values(ascending=False)


In [None]:
fig = px.bar(imp_features, title="Importance of Features",color='value',height=700,width=700,orientation='v')
fig.show()

### Observations:👀
- Number of `phones` seems to having more power for prediction
- Rest all the features are none the less similar contribution for the performance of this model.
- We achieved `R2_Score` of 0.82 from `GBR` and 0.73 from `RandomForest` which means GBR can be better over RF
- We will give a shot to even optimise this model (GBR)

## Optimisation of GBM 🚀

We are once again going to use GridSearch method to pick better parameters for our regression model. GB is very robust towards handling over-fitting, so large data results in better output accuracy.

The parameters we are using for optimisation are,
1. `n-estimators`: The number of boosting stages to perform.
2. `learning_rate`: It is the shrinking of contribution of each tree
3. `max_depth`: Maximum depth of individual regression estimators (nodes)
4. `subsample`: The fraction of samples to be used for fitting the individual base learners. A subsample = 0.5 means that 50% of training data is used prior to growing a tree.
5. `min_sample_leaf`: The minimum number of samples required to consider a leaf node
6. `min_sample_split`: The minimum number required to split an internal node
7. `max_features`: Maximum number of features to consider while looking for the best fit.

In [None]:
gbm_param_grid = {'learning_rate':[1,0.1, 0.01, 0.001], 
           'n_estimators':[100, 500, 1000],
          'max_depth':[3, 5, 8],
          'subsample':[0.7, 1], 
          'min_samples_leaf':[1, 20],
          'min_samples_split':[10, 20],
          'max_features':[4, 7]}

gbm_tuning = GridSearchCV(estimator =GradientBoostingRegressor(random_state=11),
                          param_grid = gbm_param_grid,
                          n_jobs=-1,
                          cv=5)

gbm_tuning.fit(X_train,y_train)
print(gbm_tuning.best_params_)

{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 4, 'min_samples_leaf': 1, 'min_samples_split': 20, 'n_estimators': 500, 'subsample': 1}


In [None]:
gbm_grid_predictions = gbm_tuning.predict(X_test)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, gbm_grid_predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, gbm_grid_predictions)))
print('R2_Score: ', metrics.r2_score(y_test, gbm_grid_predictions))

MAE: 3785.8389089029693
RMSE: 6404.934858617812
R2_Score:  0.6943157489140775


In [None]:
fig = px.scatter( x=y_test, y=gbm_grid_predictions,title='Test vs Optimised GBR Prediction')
fig.show()

Results are not up to the mark interestingly!

In [None]:
gbm_opt = GradientBoostingRegressor(learning_rate=0.01, n_estimators=500,max_depth=5, min_samples_split=10, min_samples_leaf=1, 
                                    subsample=0.7,max_features=7, random_state=101)
gbm_opt.fit(X_train,y_train)
feat_imp2 = pd.Series(gbm_opt.feature_importances_, list(X_train)).sort_values(ascending=False)

fig = px.bar(feat_imp2, title="Importance of Features",color='value',height=700,width=700,orientation='v')
fig.show()

### Observations: 👀
- Optimisation caused decrease in performance of our model surprisingly
- This happened because the limitaions of processing of GridSearch
- But this resulted in a difference in importance of features
- We can consider the performance of both RandomForest and GeadientBoosting are quite same.

# Performance Awards! 🏆

1. _`Random Forest` with Feature selection and NO scaling_
- Mean Absolute Error __(MAE)__: 2451.88
- Root Mean Squared Error __(RMSE)__: 3580.53
- R-Squared Score __(R2_Score)__: 0.84

2. _`Gradient Boosting` with selected features and scaling_
- Mean Absolute Error __(MAE)__: 2467.21
- Root Mean Squared Error __(RMSE)__: 3789.30
- R-Squared Score __(R2_Score)__: 0.83

3. _`Linear Regression` with selected features and scaling_
- Mean Absolute Error __(MAE)__: 2879.521
- Root Mean Squared Error __(RMSE)__:3756.43
- R-Squared Score __(R2_Score)__: 0.83

4. _`Optimised Random Forest`_
- Mean Absolute Error __(MAE)__: 3564.04
- Root Mean Squared Error __(RMSE)__: 5915.82
- R-Squared Score __(R2_Score)__: 0.73

5. _`SVM` with feature scaling and selection_
- Mean Absolute Error __(MAE)__: 7040.04
- Root Mean Squared Error __(RMSE)__: 9794.59
- R-Squared Score __(R2_Score)__: -0.16