# Name: Vineet Poojary 
# Roll no. 23
# ISUP Case Study

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Dataset 1 - Country

In [2]:
# Load the country-level dataset (Corruption_Index and Gini_Index per country).
# NOTE: this is a hard-coded absolute Windows path; adjust it when running on another machine.
df = pd.read_csv(r'C:\css\country.csv')
df  # last expression of the cell, so the frame is displayed

Unnamed: 0,Country,Corruption_Index,Gini_Index
0,Hong Kong,77,53.7
1,South Korea,53,30.2
2,China,40,46.2
3,Italy,47,32.7
4,Mongolia,38,36.5
5,Austria,75,27.6
6,Norway,85,23.5
7,UK,81,31.6
8,Canada,82,33.7
9,Germany,81,30.7


In [3]:
# Independent variable (predictor): Gini index.
X = df['Gini_Index']
# Dependent variable (response): corruption perception index.
Y = df['Corruption_Index']

# Question 1: Develop a simple linear regression model between the corruption perception index (Y) and the Gini Index (X). What is the change in the corruption perception index for every one-unit increase in the Gini Index?

In [4]:
# Prepend an intercept column ('const') so the fitted model contains the constant term a0;
# sm.OLS fits only the columns it is given and does not add an intercept on its own.
X = sm.add_constant(X)

In [5]:
# Specify and fit ordinary least squares for Corruption_Index = a0 + a1 * Gini_Index.
ols_spec = sm.OLS(endog=Y, exog=X)
model = ols_spec.fit()

# The printed parameters are the intercept a0 (const = 106.695011)
# and the slope a1 (Gini_Index = -1.295193).
print(model.params)

const         106.695011
Gini_Index     -1.295193
dtype: float64


In [6]:
# The slope coefficient a1 is exactly the change in the corruption perception index
# for every one-unit increase in the Gini index, so read it straight from the fitted parameters.

change = model.params['Gini_Index']
print("Change in the corruption perception index for every one unit increase in Gini_Index would be :", change)

Change in the corruption perception index for every one unit increase in Gini_Index would be : -1.2951931651912754


# Question 2: What proportion of the variation in corruption perception index is explained by Gini Index ?

In [7]:
rsq = model.rsquared
print("Proportion of the variation in corruption perception index explained by Gini_Index would be :", rsq)

# R-squared is used here because, by definition, it measures the proportion of the
# variance in the dependent variable (Corruption_Index) that is explained by the
# independent variable (Gini_Index).

Proportion of the variation in corruption perception index explained by Gini_Index would be : 0.21521228052302832


# Question 3: Is there a statistically significant relationship between the corruption perception index and the Gini Index at the α = 0.1% significance level?


In [8]:
alpha = 0.001  # significance level of 0.1%, written as a proportion (0.1 / 100 = 0.001)

In [9]:
# p-value attached to the Gini_Index slope coefficient of the fitted model.
p_value = model.pvalues['Gini_Index']
p_value  # last expression of the cell, so the value is displayed

0.03936250788718881

In [10]:
# The relationship is reported as significant only when the slope's p-value
# falls strictly below the chosen significance level.
verdict = (
    "There is a statistically significant relationship between corruption perception index and Gini_index at alpha 0.1%"
    if p_value < alpha
    else "There is no statistically significant relationship between corruption perception index and Gini_index at alpha 0.1%"
)
print(verdict)


There is no statistically significant relationship between corruption perception index and Gini_index at alpha 0.1%


# Question 4: Calculate the 95% confidence interval for the regression coefficient a1.

In [11]:
# conf_int gives one row per coefficient; column 0 is the lower bound and column 1
# the upper bound. alpha=0.05 requests the 95% interval. Keep only the slope's row.
intervals = model.conf_int(alpha=0.05)
conf = intervals.loc['Gini_Index']
print("95% Confidence Interval for the regression coefficient a1 would be :", conf)

95% Confidence Interval for the regression coefficient a1 would be : 0   -2.519953
1   -0.070434
Name: Gini_Index, dtype: float64


# Dataset 2 - House Prices

In [12]:
import pandas as pd
import statsmodels.api as sm

In [13]:
# Load the house-price dataset (area, bedrooms, age, price).
# NOTE: this is a hard-coded absolute Windows path; adjust it when running on another machine.
df2 = pd.read_csv(r'C:\css\homeprices.csv')
df2  # last expression of the cell, so the frame is displayed

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4400,5.0,8,795000


# Creating Multiple Linear Regression Model for the house prices data set

In [14]:
# The model cannot be fitted while the `bedrooms` column contains a NaN, so the
# missing entry is replaced by the mean of the column's observed values.

mean = df2['bedrooms'].mean()
# Assign the filled column back to the frame instead of calling
# fillna(..., inplace=True) on the `df2['bedrooms']` selection: that chained
# in-place form is deprecated in pandas 2.x and stops updating `df2` once
# Copy-on-Write is enabled.
df2['bedrooms'] = df2['bedrooms'].fillna(mean)
df2

Unnamed: 0,area,bedrooms,age,price
0,2600,3.0,20,550000
1,3000,4.0,15,565000
2,3200,4.0,18,610000
3,3600,3.0,30,595000
4,4000,5.0,8,760000
5,4400,5.0,8,795000


In [15]:
# Predictors: area and number of bedrooms. The `age` column is deliberately left out
# because the prediction questions only supply area and bedrooms.
X = df2[['area', 'bedrooms']]
# Response: sale price.
y = df2['price']

In [16]:
# Prepend the intercept column ('const') so the model is price = a0 + a1*area + a2*bedrooms.
X = sm.add_constant(X)

In [17]:
# Fit the multiple linear regression. NOTE: this rebinds `model`, so from here on
# it refers to the house-price model, not the country model fitted earlier.
model = sm.OLS(y, X).fit()
print(model.params)

const       111901.993355
area            97.840532
bedrooms     48687.707641
dtype: float64


In [18]:
# Full regression report (R-squared, F-statistic, per-coefficient t-tests and intervals).
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.941
Model:                            OLS   Adj. R-squared:                  0.902
Method:                 Least Squares   F-statistic:                     23.89
Date:                Mon, 25 Mar 2024   Prob (F-statistic):             0.0144
Time:                        01:18:33   Log-Likelihood:                -68.836
No. Observations:                   6   AIC:                             143.7
Df Residuals:                       3   BIC:                             143.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1.119e+05   7.89e+04      1.418      0.2

  warn("omni_normtest is not valid with less than 8 observations; %i "


# 1. Predict the price of a house with an area of 780 sqft and 3 bedrooms.


In [19]:
# Houses to price, one row per house as [area in sqft, bedrooms];
# the column order must match the order of the predictors the model was fitted on.
data = [[780, 3], [1500, 3], [2000, 4]]

In [20]:
# Prepend the intercept column so each row becomes [1, area, bedrooms], matching the fitted design matrix.
# NOTE(review): add_constant defaults to has_constant='skip', which appears to return the input
# unchanged when any column is already constant (e.g. a single house, or all houses with the
# same bedroom count) -- confirm and pass has_constant='add' if this list is ever changed that way.
data = sm.add_constant(data)

In [21]:
# Predicted prices, one per row of `data`, in the same order.
pred = model.predict(data)

In [22]:
# pred[0] is the first row of `data` (780 sqft, 3 bedrooms); shown with two decimals.
print(f"1. House with 780 sqft, 3 bedrooms would be around: ${pred[0]:.2f}")

1. House with 780 sqft, 3 bedrooms would be around: $334280.73


# 2. Predict the house prices for two houses: one with 1500 sqft and 3 bedrooms, and another with 2000 sqft and 4 bedrooms.



In [23]:
# pred[1] and pred[2] are the second and third rows of `data`; both belong to question 2.
print(f"2. House with 1500 sqft, 3 bedrooms would be around: ${pred[1]:.2f}")
print(f"3. House with 2000 sqft, 4 bedrooms would be around: ${pred[2]:.2f}")

2. House with 1500 sqft, 3 bedrooms would be around: $404725.91
3. House with 2000 sqft, 4 bedrooms would be around: $502333.89


# Assignment End