In [52]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
import statsmodels.stats.multicomp as mc
from statsmodels.multivariate.manova import MANOVA

In [53]:
# Step 1: load the students-performance dataset and preview the first rows.

df = pd.read_csv('StudentsPerformance.csv')

# The preview must be the last expression of the cell to be rendered.
# The former trailing line `df[index="gnder"]` was removed: a keyword argument
# inside a subscript is a SyntaxError, which stopped the whole cell from running.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [54]:
# Step 2: replace the column labels with space-free names; the score and
#         factor names are the ones used in the model formulas further down.

space_free_names = [
    'gender',
    'race/ethnicity',
    'parentallevelofeducation',
    'lunch',
    'testpreparationcourse',
    'mathscore',
    'readingscore',
    'writingscore',
]
df.columns = space_free_names

In [78]:
# Step 3: MANOVA with the three scores as joint dependent variables and
#         parental education, gender, lunch and test-preparation course as
#         factors, to see whether the score profile differs across them.

dependent_terms = ['mathscore', 'readingscore', 'writingscore']
factor_terms = ['parentallevelofeducation', 'gender', 'lunch', 'testpreparationcourse']

# Formula has the shape "y1+y2+y3~f1+f2+f3+f4".
manova_formula = '+'.join(dependent_terms) + '~' + '+'.join(factor_terms)

maov = MANOVA.from_formula(manova_formula, data=df)
print(maov.mv_test())


                    Multivariate linear model
                                                                 
-----------------------------------------------------------------
         Intercept        Value  Num DF  Den DF   F Value  Pr > F
-----------------------------------------------------------------
            Wilks' lambda 0.1449 3.0000 989.0000 1946.0521 0.0000
           Pillai's trace 0.8551 3.0000 989.0000 1946.0521 0.0000
   Hotelling-Lawley trace 5.9031 3.0000 989.0000 1946.0521 0.0000
      Roy's greatest root 5.9031 3.0000 989.0000 1946.0521 0.0000
-----------------------------------------------------------------
                                                                 
-----------------------------------------------------------------
 parentallevelofeducation Value   Num DF   Den DF  F Value Pr > F
-----------------------------------------------------------------
            Wilks' lambda 0.8777 15.0000 2730.5949  8.8111 0.0000
           Pillai's trace 0.12

In [79]:
# Step 4: follow-up to the MANOVA of step 3. Each score is now tested against
#         each factor separately with a one-factor ANOVA, to find scores that
#         are NOT affected by a factor. The notebook reports every p-value as
#         < 0.05, so nothing is dropped and the most affected score is kept.

# Maths
factors = ('parentallevelofeducation', 'gender', 'lunch', 'testpreparationcourse')
for factor in factors:
    fitted = ols(f'mathscore~{factor}', data=df).fit()
    anova_table = sm.stats.anova_lm(fitted, typ=1)
    print(factor)
    print(anova_table)

parentallevelofeducation
                             df         sum_sq      mean_sq         F  \
parentallevelofeducation    5.0    7295.561831  1459.112366  6.521583   
Residual                  994.0  222393.517169   223.735933       NaN   

                            PR(>F)  
parentallevelofeducation  0.000006  
Residual                       NaN  
gender
             df         sum_sq      mean_sq          F        PR(>F)
gender      1.0    6481.373854  6481.373854  28.979336  9.120186e-08
Residual  998.0  223207.705146   223.655015        NaN           NaN
lunch
             df         sum_sq       mean_sq           F        PR(>F)
lunch       1.0   28278.037838  28278.037838  140.118842  2.413196e-30
Residual  998.0  201411.041162    201.814671         NaN           NaN
testpreparationcourse
                          df         sum_sq      mean_sq          F  \
testpreparationcourse    1.0    7253.160232  7253.160232  32.542648   
Residual               998.0  222435.918768   2

In [80]:
# Reading: one-factor ANOVA of the reading score against each factor in turn.

factors = ('parentallevelofeducation', 'gender', 'lunch', 'testpreparationcourse')
for factor in factors:
    fitted = ols(f'readingscore~{factor}', data=df).fit()
    anova_table = sm.stats.anova_lm(fitted, typ=1)
    print(factor)
    print(anova_table)

parentallevelofeducation
                             df         sum_sq      mean_sq       F  \
parentallevelofeducation    5.0    9506.493193  1901.298639  9.2894   
Residual                  994.0  203445.945807   204.673990     NaN   

                                PR(>F)  
parentallevelofeducation  1.168246e-08  
Residual                           NaN  
gender
             df         sum_sq       mean_sq          F        PR(>F)
gender      1.0   12710.843676  12710.843676  63.350584  4.680539e-15
Residual  998.0  200241.595324    200.642881        NaN           NaN
lunch
             df         sum_sq       mean_sq          F        PR(>F)
lunch       1.0   11222.155126  11222.155126  55.518242  2.002797e-13
Residual  998.0  201730.283874    202.134553        NaN           NaN
testpreparationcourse
                          df         sum_sq       mean_sq          F  \
testpreparationcourse    1.0   12448.726414  12448.726414  61.963087   
Residual               998.0  200503.71

In [81]:
# Writing: one-factor ANOVA of the writing score against each factor in turn.

factors = ('parentallevelofeducation', 'gender', 'lunch', 'testpreparationcourse')
for factor in factors:
    fitted = ols(f'writingscore~{factor}', data=df).fit()
    anova_table = sm.stats.anova_lm(fitted, typ=1)
    print(factor)
    print(anova_table)

parentallevelofeducation
                             df         sum_sq     mean_sq          F  \
parentallevelofeducation    5.0   15623.225898  3124.64518  14.442416   
Residual                  994.0  215053.858102   216.35197        NaN   

                                PR(>F)  
parentallevelofeducation  1.120280e-13  
Residual                           NaN  
gender
             df         sum_sq       mean_sq          F        PR(>F)
gender      1.0   20930.822413  20930.822413  99.591576  2.019878e-22
Residual  998.0  209746.261587    210.166595        NaN           NaN
lunch
             df         sum_sq       mean_sq          F        PR(>F)
lunch       1.0   13933.413119  13933.413119  64.156643  3.186190e-15
Residual  998.0  216743.670881    217.178027        NaN           NaN
testpreparationcourse
                          df         sum_sq       mean_sq           F  \
testpreparationcourse    1.0   22591.447181  22591.447181  108.350892   
Residual               998.0  2

In [82]:
# Step 5: encode the chosen factor as integers. `lunch` was picked because it
#         shows the smallest p-value against the math score in step 4.

LUNCH_CODES = {"standard": 0, 'free/reduced': 1}

# Series.map() turns every value that is not a key of the dict into NaN, so
# running this cell a second time on the already-encoded column would silently
# wipe it out. Only map while the column still holds the text labels, and fail
# loudly if a label has no code instead of letting NaN leak into later tests.
if not pd.api.types.is_numeric_dtype(df['lunch']):
    lunch_encoded = df['lunch'].map(LUNCH_CODES)
    unknown_labels = df.loc[lunch_encoded.isna(), 'lunch'].unique()
    if len(unknown_labels):
        raise ValueError(f"unexpected lunch categories: {list(unknown_labels)}")
    df['lunch'] = lunch_encoded.astype('int64')


In [60]:
# Step 6: Tukey HSD over every pair of distinct `lunch` values, to see which
#         pairs differ in mean math score.

lunch_comparison = mc.MultiComparison(df['mathscore'], groups=df['lunch'])
tukey_result = lunch_comparison.tukeyhsd()
tukey_result.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
0,1,-11.113,-0.0,-12.9553,-9.2707,True


In [76]:
# Separate the rows by lunch code (0 and 1) and pull out the math scores of
# each group as plain lists for the two-sample test in the next cell.
is_code_zero = df['lunch'] == 0
is_code_one = df['lunch'] == 1

A = df[is_code_zero]
B = df[is_code_one]

a = list(A['mathscore'].to_numpy())
b = list(B['mathscore'].to_numpy())

In [77]:
# One-sided two-sample t-test with pooled variance (equal_var=True):
# the alternative hypothesis is that the mean of `a` is greater than that of `b`.
lunch_ttest = stats.ttest_ind(a, b, alternative="greater", equal_var=True)
lunch_ttest

Ttest_indResult(statistic=11.837180472914612, pvalue=1.2065977996568537e-30)

Next we fit a linear regression. The columns 'gender', 'race/ethnicity', 
'parentallevelofeducation', 'lunch', and 'testpreparationcourse' are used as the independent variables, and one of 
'mathscore', 'readingscore', or 'writingscore' is predicted as the dependent variable.

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder


In [15]:
# Step 1: read the CSV again and give the columns space-free names.
renamed_columns = [
    'gender',
    'race/ethnicity',
    'parentallevelofeducation',
    'lunch',
    'testpreparationcourse',
    'mathscore',
    'readingscore',
    'writingscore',
]

df = pd.read_csv('StudentsPerformance.csv')
df.columns = renamed_columns
df

Unnamed: 0,gender,race/ethnicity,parentallevelofeducation,lunch,testpreparationcourse,mathscore,readingscore,writingscore
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [16]:
# Step 2: convert the categorical predictors to integer codes.
#
# LabelEncoder numbers the distinct labels of a column in sorted order, which
# is why e.g. 'free/reduced' becomes 0 and 'standard' becomes 1 in the output.
#
# A separate encoder is fitted per column and kept in `label_encoders`:
# refitting a single shared instance would overwrite its `classes_` each time,
# leaving no way to translate the codes of the earlier columns back to labels.
# NOTE(review): race/ethnicity and parental education are fed to the linear
# model as if their integer codes were ordered quantities -- consider one-hot
# encoding if the coefficients are going to be interpreted.

CATEGORICAL_COLUMNS = [
    'gender',
    'race/ethnicity',
    'parentallevelofeducation',
    'lunch',
    'testpreparationcourse',
]

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    # `label_encoder` stays bound to the encoder of the last column, as before.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
df

Unnamed: 0,gender,race/ethnicity,parentallevelofeducation,lunch,testpreparationcourse,mathscore,readingscore,writingscore
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75
...,...,...,...,...,...,...,...,...
995,0,4,3,1,0,88,99,95
996,1,2,2,0,1,62,55,55
997,0,2,2,0,0,59,71,65
998,0,3,4,1,0,68,78,77


In [17]:
# Step 3: pick the predictors (X) and the response (y); the math score is the
#         response modelled here.

feature_columns = [
    'gender',
    'race/ethnicity',
    'parentallevelofeducation',
    'lunch',
    'testpreparationcourse',
]
target_column = 'mathscore'

X = df[feature_columns]
y = df[target_column]

In [18]:
# Step 4: hold out 20% of the rows for evaluation. (X_train, y_train) is used
#         to fit the model and (X_test, y_test) to measure its performance.
#         The fixed random_state makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)


In [19]:
# Step 5: fit an ordinary linear regression on the training split, then
#         predict the math score for the held-out rows.

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)

predictions = model.predict(X_test)
print(predictions)

[69.92072498 65.69892376 67.03052738 55.86108003 57.31691432 73.55448841
 63.75781137 60.46539965 71.77008998 60.46539965 64.72836756 72.70816289
 66.16273087 63.75781137 78.67656942 54.52947641 69.92072498 52.609835
 69.92072498 68.87989238 68.95016878 65.12367355 63.77059895 59.51631444
 52.1245569  68.00108357 68.00108357 62.82151374 76.01336219 61.94092965
 64.15311736 56.76135982 63.73811566 65.21364566 78.315522   56.27608172
 73.55448841 71.30628287 68.84740909 60.13683552 73.06921031 69.36517048
 68.58912136 61.10739171 54.52947641 69.36517048 65.57469308 65.57469308
 55.27304224 73.22592428 65.69892376 60.39512325 58.66821364 63.39676395
 52.48560433 61.46843914 55.79080362 65.69892376 73.19344099 70.7670505
 53.07364212 68.00108357 59.98012156 67.52859306 75.52808409 48.3665628
 56.83163622 55.86108003 65.31640535 67.61856517 52.609835   68.87989238
 76.49864029 71.77008998 65.73140705 60.10435223 69.33268719 65.69892376
 64.72836756 72.22288479 68.46489069 57.21415463 58.578

In [20]:
# Step 6: score the predictions on the test split with R^2 and the mean
#         squared error.

# NOTE(review): accuracy_score is imported but not used in this cell.
from sklearn.metrics import accuracy_score,r2_score

squared_errors = (predictions - y_test) ** 2

print(r2_score(y_test, predictions))
mse = np.mean(squared_errors)  # mean squared error
mse


0.2526256401256477


154.8768751636834