# chi square distribution

- 1. goodness of fit - one variable
- 2. degree of association - two variables

In [1]:
import numpy as np
import pandas as pd
import scipy
from scipy import stats
from scipy.stats import chisquare

In [2]:
# Hypothesised category proportions for the goodness-of-fit test (should sum to 1)
pattern = np.array([0.6,0.25,0.1,0.05])

In [3]:
sum(pattern)  # sanity check: the proportions should add up to 1

1.0

In [4]:
# Observed counts per category, in the same order as `pattern`
obs_counts = np.array([250,180,120,40])

In [5]:
# Expected counts under the hypothesised proportions. The total is taken from
# the data (250 + 180 + 120 + 40 = 590) instead of being hard-coded, so the
# observed and expected counts always sum to the same value, which the
# chi-square goodness-of-fit test requires.
exp_counts = obs_counts.sum() * pattern

In [6]:
exp_counts

array([354. , 147.5,  59. ,  29.5])

In [7]:
# Goodness-of-fit test: observed counts against the expected counts
chisquare(f_obs=obs_counts, f_exp=exp_counts)

Power_divergenceResult(statistic=104.51977401129943, pvalue=1.657508756495914e-22)

In [9]:
# Difference between observed and expected counts for each category
d = obs_counts - exp_counts
d

array([-104. ,   32.5,   61. ,   10.5])

In [10]:
# Each category's contribution to the chi-square statistic: (O - E)^2 / E
c = d**2 / exp_counts
c

array([30.55367232,  7.16101695, 63.06779661,  3.73728814])

In [11]:
sum(c)  # chi-square statistic computed by hand; equals the statistic reported by chisquare

104.51977401129943

# Degree of association - 2 variables


In [12]:
# Observed 2x3 contingency table used for the test of association.
# NOTE(review): presumably rows = job outcome and columns = qualification
# level -- confirm against the data source.
a1 = np.array([[45,27,21],[31,28,27]])
a1

array([[45, 27, 21],
       [31, 28, 27]])

H0 (null hypothesis) - whether you get the job or not does not depend on the qualifications (the two variables are independent)

In [14]:
from scipy.stats import chi2_contingency

In [15]:
# Chi-square test of independence on the contingency table
chi2_contingency(a1)

Chi2ContingencyResult(statistic=3.0780934692405153, pvalue=0.2145855609456467, dof=2, expected_freq=array([[39.48603352, 28.57541899, 24.93854749],
       [36.51396648, 26.42458101, 23.06145251]]))

# Supervised Learning

### 1. Simple linear regression

- response - continuous
- single predictor - continuous
- y = mx + c [simple linear regression]
- y = m1x1 + m2x2 + c [multiple linear regression]

The correlation coefficient describes the linear relationship between two continuous variables.
- 1. Sign of the number (-, +) - the sign of the correlation coefficient tells us the type of relationship:
positive relationship - as the 1st value increases, the 2nd value also increases;
negative relationship - as the value of x increases, the value of y decreases.
- 2. Magnitude of the number -
the magnitude tells you the strength of the relationship:
the closer the absolute value is to 1, the stronger the relationship between the 2 variables.

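The correlation coefficient (cc) is denoted by r. Example: if r between A and B is 0.87 and r between A and C is -0.96, then C has the stronger relationship with A, because -0.96 is closer to -1 (a strong negative relationship); when comparing strength, ignore the sign and compare the magnitudes.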

If the cc between A and B is 0.1, the relationship is very weak (almost no linear relationship).

r always lies between -1 and 1.
It can never fall outside this range; values such as -1.1 or 1.1 are impossible.


# Simple Linear Regression

In [155]:
import numpy as np
import pandas as pd
import statsmodels
import sklearn
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [156]:
# Load the 'faithful' sheet (eruptions and waiting columns) from the workbook
df = pd.read_excel('CDAC_DataBook.xlsx', sheet_name='faithful')

In [157]:
df.head()

Unnamed: 0,eruptions,waiting
0,3.6,79
1,1.8,54
2,3.333,74
3,2.283,62
4,4.533,85


In [158]:
# Hold out 25% of the rows for testing: predictor = waiting, response = eruptions.
# random_state pins the shuffle so the split, and everything computed from it,
# is reproducible on Restart & Run All.
x_train,x_test,y_train,y_test = train_test_split(df['waiting'],df['eruptions'],test_size = 0.25,random_state = 42)

In [159]:
x_train.head()

37     80
191    57
173    68
238    79
121    69
Name: waiting, dtype: int64

In [160]:
y_train.head()

37     4.833
191    1.833
173    3.333
238    3.950
121    4.067
Name: eruptions, dtype: float64

In [161]:
# sm.OLS does not add an intercept on its own, so append a constant column to
# the predictor; prepend=False places it after the existing column
x_train = sm.add_constant(x_train,prepend = False)

In [162]:
# Fit ordinary least squares: eruptions on waiting plus the constant term
mod1 = sm.OLS(y_train,x_train).fit()

In [163]:
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:              eruptions   R-squared:                       0.814
Model:                            OLS   Adj. R-squared:                  0.813
Method:                 Least Squares   F-statistic:                     885.0
Date:                Fri, 23 Jun 2023   Prob (F-statistic):           9.41e-76
Time:                        15:12:29   Log-Likelihood:                -147.25
No. Observations:                 204   AIC:                             298.5
Df Residuals:                     202   BIC:                             305.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
waiting        0.0773      0.003     29.749      0.0

# R-squared

- R-squared is the proportion of the variation in the response that is explained by the fitted model.
- It ranges from 0 to 1; the closer it is to 1, the better the model fits the data.

In [111]:
# Add the same constant column to the test predictors so they line up with the
# training design matrix. has_constant='add' forces the column to be added even
# if a test column happens to have a single repeated value (the default 'skip'
# would then silently leave the constant out and break predict).
x_test = sm.add_constant(x_test,prepend = False,has_constant = 'add')

In [112]:
y_pred = mod1.predict(x_test) #predicted output

In [113]:
y_pred[:5] #prediction

62    1.737247
53    4.170883
24    3.714576
2     3.714576
35    2.041452
dtype: float64

In [114]:
y_test[:5] #actual output

62    1.750
53    4.833
24    4.533
2     3.333
35    2.017
Name: eruptions, dtype: float64

# Multiple Linear Regression

In [115]:
# Load the 'stackloss' sheet (AirFlow, WaterTemp, AcidConc, StackLoss); this rebinds `df`
df = pd.read_excel('CDAC_DataBook.xlsx',sheet_name = 'stackloss')

In [116]:
df.head()

Unnamed: 0,AirFlow,WaterTemp,AcidConc,StackLoss
0,80,27,89,42
1,80,27,88,37
2,75,25,90,37
3,62,24,87,28
4,62,22,87,18


In [117]:
# Predictors = every column except StackLoss, response = StackLoss, 25% held out.
# random_state pins the shuffle so the split is reproducible across runs.
x_train,x_test,y_train,y_test = train_test_split(df.drop('StackLoss',axis=1),df['StackLoss'],test_size=0.25,random_state=42)

In [118]:
x_train.head()

Unnamed: 0,AirFlow,WaterTemp,AcidConc
8,58,23,87
20,70,20,91
19,56,20,82
9,58,18,80
11,58,17,88


In [119]:
y_train.head()

8     15
20    15
19    15
9     14
11    13
Name: StackLoss, dtype: int64

In [120]:
# Append the intercept column to the training predictors (placed last via prepend=False)
x_train = sm.add_constant(x_train,prepend=False)

In [121]:
# Fit OLS: StackLoss on AirFlow, WaterTemp and AcidConc plus the constant term
mod1 = sm.OLS(y_train,x_train).fit()

In [122]:
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:              StackLoss   R-squared:                       0.866
Model:                            OLS   Adj. R-squared:                  0.830
Method:                 Least Squares   F-statistic:                     23.75
Date:                Thu, 22 Jun 2023   Prob (F-statistic):           4.16e-05
Time:                        15:28:48   Log-Likelihood:                -38.805
No. Observations:                  15   AIC:                             85.61
Df Residuals:                      11   BIC:                             88.44
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
AirFlow        0.6423      0.177      3.637      0.0



In [123]:
# Add the constant column to the test predictors so they match the training
# design matrix. With so few test rows a predictor can easily end up with a
# single repeated value; has_constant='add' still adds the constant in that
# case, whereas the default 'skip' would silently leave it out.
x_test = sm.add_constant(x_test,prepend = False,has_constant = 'add')

In [124]:
y_pred = mod1.predict(x_test) #predicted output

In [125]:
y_pred[:5] #prediction

15     6.811749
0     37.527211
13    11.704680
4     19.682568
16    11.345542
dtype: float64

In [126]:
y_test[:5] #actual output

15     7
0     42
13    12
4     18
16     8
Name: StackLoss, dtype: int64

# Categorical Regression

In [127]:
# Load the 'salaries' sheet; this rebinds `df` to a third dataset
df = pd.read_excel('CDAC_DataBook.xlsx',sheet_name = 'salaries')

In [128]:
df.head()

Unnamed: 0,rank,discipline,yrs_phd,yrs_service,gender,salary
0,Prof,B,19,18,Male,139750
1,Prof,B,20,16,Male,173200
2,AsstProf,B,4,3,Male,79750
3,Prof,B,45,39,Male,115000
4,Prof,B,40,41,Male,141500


In [129]:
# Keep only the columns used in this model: one categorical predictor (rank),
# one numeric predictor (yrs_service) and the response (salary)
df = df[['rank','yrs_service','salary']]

In [130]:
# One-hot encode rank. drop_first=True drops the first category, which becomes
# the baseline, so the remaining dummies are not collinear with the constant
# term. dtype=int keeps the dummies as numeric 0/1: pandas >= 2.0 otherwise
# returns booleans, which turn into an object-dtype design matrix once mixed
# with the integer columns and make sm.OLS fail.
rank_dummy = pd.get_dummies(df['rank'],drop_first = True,dtype = int)

In [143]:
rank_dummy.head(10)

Unnamed: 0,AsstProf,Prof
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
5,0,0
6,0,1
7,0,1
8,0,1
9,0,1


In [132]:
# Remove the original categorical column; its dummy encoding is used instead.
# NOTE(review): not idempotent -- running this cell a second time raises
# KeyError because 'rank' is already gone from `df`.
df = df.drop('rank',axis=1)

In [133]:
# Append the dummy columns to the numeric columns, column-wise.
# NOTE(review): not idempotent -- re-running this cell appends the dummy
# columns again, producing duplicate column names.
df = pd.concat([df,rank_dummy],axis=1)

In [134]:
df.head()

Unnamed: 0,yrs_service,salary,AsstProf,Prof
0,18,139750,0,1
1,16,173200,0,1
2,3,79750,1,0
3,39,115000,0,1
4,41,141500,0,1


In [135]:
# Predictors = yrs_service plus the rank dummies, response = salary, 25% held out.
# random_state pins the shuffle so the split is reproducible across runs.
x_train,x_test,y_train,y_test = train_test_split(df.drop('salary',axis=1),df['salary'],test_size=0.25,random_state=42)

In [136]:
# Append the intercept column to the training predictors (placed last via prepend=False)
x_train = sm.add_constant(x_train,prepend=False)

In [137]:
# Fit OLS: salary on yrs_service and the rank dummies plus the constant term
mod1 = sm.OLS(y_train,x_train).fit()

In [138]:
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                 salary   R-squared:                       0.404
Model:                            OLS   Adj. R-squared:                  0.398
Method:                 Least Squares   F-statistic:                     66.15
Date:                Thu, 22 Jun 2023   Prob (F-statistic):           1.09e-32
Time:                        15:32:13   Log-Likelihood:                -3409.7
No. Observations:                 297   AIC:                             6827.
Df Residuals:                     293   BIC:                             6842.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
yrs_service  -173.9735    135.849     -1.281      

In [139]:
# Add the constant column to the test predictors so they match the training
# design matrix. A 0/1 dummy column can be all ones in a test split;
# has_constant='add' still adds the constant in that case, whereas the default
# 'skip' would treat the dummy as the constant and silently leave it out.
x_test = sm.add_constant(x_test,prepend = False,has_constant = 'add')

In [140]:
y_pred = mod1.predict(x_test) #predicted output

In [141]:
y_pred[:5] #prediction

268    123052.088277
76     130185.002702
86     124269.902935
358    128271.293954
335    124095.929413
dtype: float64

In [142]:
y_test[:5] #actual output

268     89650
76     150480
86     152708
358    109954
335    151445
Name: salary, dtype: int64