In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("data.csv")

In [3]:
# rows and column
data.shape

(1000, 8)

In [4]:
# data types
data.dtypes

gender                         object
race_ethnicity                 object
parental_level_of_education    object
lunch                          object
test_preparation_course        object
math_score                      int64
reading_score                   int64
writing_score                   int64
dtype: object

In [5]:
# data types
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# data types for only numeric values
data.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [7]:
# data types for only non numeric data
data.describe(include="object")

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
count,1000,1000,1000,1000,1000
unique,2,5,6,2,2
top,female,group C,some college,standard,none
freq,518,319,226,645,642


In [None]:
# descriptive stats

In [10]:
# column name
data.columns

Index(['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')

In [11]:
# Index
data.index

RangeIndex(start=0, stop=1000, step=1)

In [12]:
# complete information of the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   math_score                   1000 non-null   int64 
 6   reading_score                1000 non-null   int64 
 7   writing_score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [17]:
# Now explain if there are some issues in data to be resolved or all is ok.

In [13]:
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [14]:
# kitne females hai jinka talaauk "group A" se hai and un k parent k pass "master's degree" hia
data[(data.gender == "female") & (data.race_ethnicity == "group B") & (data.parental_level_of_education == "master's degree")]

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
2,female,group B,master's degree,standard,none,90,95,93
167,female,group B,master's degree,free/reduced,completed,58,76,78
178,female,group B,master's degree,free/reduced,completed,52,70,62
514,female,group B,master's degree,free/reduced,completed,77,97,94
781,female,group B,master's degree,standard,none,77,90,84


In [15]:
data[(data.gender == "female") & (data.race_ethnicity == "group B") & (data.parental_level_of_education == "master's degree")].count()

gender                         5
race_ethnicity                 5
parental_level_of_education    5
lunch                          5
test_preparation_course        5
math_score                     5
reading_score                  5
writing_score                  5
dtype: int64

In [16]:
data.gender.unique()

array(['female', 'male'], dtype=object)

In [17]:
data.parental_level_of_education.unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [18]:
data.race_ethnicity.unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [19]:
# kitne overall students aese hai jinka lunch standard hai and unho ne mathematics main score 90+ kia hai
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [20]:
data[(data.lunch == "standard") & (data.math_score >= 90)]

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
2,female,group B,master's degree,standard,none,90,95,93
34,male,group E,some college,standard,none,97,87,82
104,male,group C,some college,standard,completed,98,86,90
114,female,group E,bachelor's degree,standard,completed,99,100,100
121,male,group B,associate's degree,standard,completed,91,89,92
165,female,group C,bachelor's degree,standard,completed,96,100,100
171,male,group E,some high school,standard,none,94,88,78
179,female,group D,some high school,standard,completed,97,100,100
233,male,group E,some high school,standard,none,92,87,78
263,female,group E,high school,standard,none,99,93,90


In [21]:
data[(data.lunch == "standard") & (data.math_score >= 90)].count()

gender                         51
race_ethnicity                 51
parental_level_of_education    51
lunch                          51
test_preparation_course        51
math_score                     51
reading_score                  51
writing_score                  51
dtype: int64

In [22]:
data.lunch.unique()

array(['standard', 'free/reduced'], dtype=object)

In [23]:
# 
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [24]:
data[data.gender == "female"]["math_score"].mean()

63.633204633204635

In [25]:
data[data.gender == "male"]["math_score"].mean()

68.72821576763485

In [26]:
data[data.gender == "female"]["math_score"].mode()

0    65
Name: math_score, dtype: int64

In [27]:
data[data.gender == "male"]["math_score"].mode()

0    62
Name: math_score, dtype: int64

In [28]:
data.groupby('gender')['math_score'].agg('mean')

gender
female    63.633205
male      68.728216
Name: math_score, dtype: float64

In [29]:
data.groupby('gender')['math_score'].agg(['mean','max', 'min'])

Unnamed: 0_level_0,mean,max,min
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,63.633205,100,0
male,68.728216,100,27


In [30]:
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [31]:
# data.groupby('test_preparation_course')['math_score'].agg(['mean','max', 'min'])
data[data.math_score + data.reading_score + data.writing_score]

KeyError: "None of [Index([218, 247, 278, 148, 229, 232, 275, 122, 195, 148,\n       ...\n       242, 225, 207, 208, 188, 282, 172, 195, 223, 249],\n      dtype='int64', length=1000)] are in the [columns]"

In [None]:
data.groupby('test_preparation_course')[['math_score', 'reading_score', 'writing_score']].agg(['mean', 'max', 'min'])

In [None]:
data[data.test_preparation_course == 'completed'].describe()

In [None]:
data[data.test_preparation_course == 'none'].describe()

In [None]:
# Assuming your dataset is already loaded into 'data'
# Group by 'test_preparation_course' and calculate the mean scores
performance_comparison = data.groupby('test_preparation_course')[['math_score', 'reading_score', 'writing_score']].mean()

# Display the summary table
print(performance_comparison)

# Visualization
performance_comparison.plot(kind='bar', figsize=(10,6), colormap='viridis')
plt.title("Impact of Test Preparation Course on Student Performance")
plt.ylabel("Average Score")
plt.xlabel("Test Preparation Course")
plt.xticks(rotation=0)
plt.show()

In [None]:
data.head()

In [None]:
# jinke parent parhe lkhe hain us k marks ache ahin ya kum parhe lkhe k

In [None]:
data.parental_level_of_education.unique()

In [None]:
data[(data.parental_level_of_education == 'some college') | (data.parental_level_of_education == 'high school') | (data.parental_level_of_education == 'some high school')]

In [78]:
data[(data.parental_level_of_education == 'some college') | (data.parental_level_of_education == 'high school') | (data.parental_level_of_education == 'some high school')].describe()

Unnamed: 0,math_score,reading_score,writing_score
count,601.0,601.0,601.0
mean,64.419301,67.15807,65.579035
std,15.017009,14.63113,15.159513
min,0.0,17.0,10.0
25%,55.0,58.0,55.0
50%,65.0,68.0,67.0
75%,74.0,78.0,77.0
max,100.0,100.0,100.0


In [83]:
data['total'] = data.math_score + data.reading_score + data.writing_score

In [85]:
data[(data.parental_level_of_education == 'some college') | (data.parental_level_of_education == 'high school') | (data.parental_level_of_education == 'some high school')]

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total
1,female,group C,some college,standard,completed,69,90,88,247
4,male,group C,some college,standard,none,76,78,75,229
6,female,group B,some college,standard,completed,88,95,92,275
7,male,group B,some college,free/reduced,none,40,43,39,122
8,male,group D,high school,free/reduced,completed,64,64,67,195
...,...,...,...,...,...,...,...,...,...
994,male,group A,high school,standard,none,63,63,62,188
996,male,group C,high school,free/reduced,none,62,55,55,172
997,female,group C,high school,free/reduced,completed,59,71,65,195
998,female,group D,some college,standard,completed,68,78,77,223


In [86]:
data[(data.parental_level_of_education == 'some college') | (data.parental_level_of_education == 'high school') | (data.parental_level_of_education == 'some high school')]['total'].mean()

197.15640599001665

In [88]:
data[(data.parental_level_of_education == "bachelor's degree") | (data.parental_level_of_education == "master's degree") | (data.parental_level_of_education == "associate's degree")]['total'].mean()

212.58395989974937

### Testing in Statistics
  - t test
  - z test
  - f test
  - CHi sq
  - annova
#### these all comes under Hypothesis testing


##### Funnel Analysis

## Anova Test