In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import plotly.express as px

### Importing Data in Python

In [2]:
# Load the practice dataset; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = "practice.csv"
df = pd.read_csv(DATA_PATH)

### Exploring the Dataset

In [3]:
# Preview the first five rows to get a feel for the columns and values.
df.head(n=5)

Unnamed: 0,Group,Treatment,Age,Cholesterol
0,I,A,74,6.7
1,I,A,68,6.7
2,II,B,21,5.0
3,I,B,66,3.7
4,III,A,37,10.3


In [4]:
# Dimensions of the frame as (rows, columns).
df.shape

(300, 4)

In [5]:
# Column names, non-null counts and dtypes: a quick check for missing values
# and for columns whose dtype should be converted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Group        300 non-null    object 
 1   Treatment    300 non-null    object 
 2   Age          300 non-null    int64  
 3   Cholesterol  300 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 9.5+ KB


In [6]:
# Inspect the Age column on its own.
df["Age"]

0       74
1       68
2       21
3       66
4       37
      ... 
295     68
296     46
297     70
298      2
299    103
Name: Age, Length: 300, dtype: int64

In [7]:
# Group and Treatment hold a small set of repeated labels, so store them as
# categoricals. A single `astype` call with a column -> dtype mapping converts
# both at once and avoids assigning through attribute access (`df.Group = ...`),
# which sets a plain attribute rather than a column when the name does not
# already exist. Re-running this cell is harmless (the cast is idempotent).
df = df.astype({"Group": "category", "Treatment": "category"})

In [8]:
# Re-check the dtypes: Group and Treatment should now be reported as category.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Group        300 non-null    category
 1   Treatment    300 non-null    category
 2   Age          300 non-null    int64   
 3   Cholesterol  300 non-null    float64 
dtypes: category(2), float64(1), int64(1)
memory usage: 5.6 KB


In [9]:
# Lowest age present in the data.
df["Age"].min()

2

In [10]:
# Highest age present in the data.
df["Age"].max()

104

In [11]:
# Number of rows in each treatment arm.
df["Treatment"].value_counts()

A    151
B    149
Name: Treatment, dtype: int64

In [12]:
# Contingency table of row counts for every Group x Treatment combination,
# to see how evenly the treatments are spread across the groups.
pd.crosstab(index=df["Group"], columns=df["Treatment"])

Treatment,A,B
Group,Unnamed: 1_level_1,Unnamed: 2_level_1
I,50,51
II,50,50
III,51,48


In [18]:
# Rows whose Age is below 18.
is_under_18 = df["Age"] < 18
df.loc[is_under_18]

Unnamed: 0,Group,Treatment,Age,Cholesterol
7,III,B,13,9.1
76,III,A,10,8.6
97,III,A,14,5.5
110,II,B,16,4.6
118,I,A,10,5.7
173,II,A,13,6.7
280,I,A,17,5.0
298,I,A,2,4.3


### Descriptive Statistics

#### Measures of Central Tendency

In [19]:
# Arithmetic mean of the Cholesterol column.
df["Cholesterol"].mean()

6.007333333333333

In [20]:
# Arithmetic mean of the Age column.
df["Age"].mean()

51.61333333333334

In [21]:
# Most frequent age(s); mode() returns a Series because several values can tie.
df["Age"].mode()

0    41
1    42
2    52
dtype: int64

In [25]:
# How many rows have Age == 52 (one of the modal ages); the first element of
# the shape tuple is the row count.
df.loc[df["Age"] == 52].shape

(9, 4)

In [26]:
# Median of the Age column.
df["Age"].median()

52.0

#### Measures of Dispersion

In [28]:
# Sample standard deviation of Cholesterol (ddof=1 is the pandas default,
# spelled out here so the n - 1 denominator is explicit).
df["Cholesterol"].std(ddof=1)

1.9166615578430293

In [29]:
# Sample variance of Cholesterol (ddof=1 is the pandas default, spelled out
# here so the n - 1 denominator is explicit).
df["Cholesterol"].var(ddof=1)

3.673591527313268

In [32]:
# Sanity check: squaring the standard deviation should reproduce the variance
# (up to floating-point rounding in the last digits).
cholesterol_std = df["Cholesterol"].std()
cholesterol_std ** 2

3.6735915273132678

### T-test

In [34]:
# Levene's test for equal variances of Cholesterol between treatments A and B.
# Its p-value informs the `equal_var` choice in the t-test that follows.
levene_sample_a = df.loc[df["Treatment"] == "A", "Cholesterol"]
levene_sample_b = df.loc[df["Treatment"] == "B", "Cholesterol"]
stats.levene(levene_sample_a, levene_sample_b)

LeveneResult(statistic=1.0400768536663356, pvalue=0.30863082388124846)

In [36]:
# Independent two-sample t-test comparing Cholesterol between treatments A and
# B, run with equal_var=True (i.e. assuming both groups share one variance).
chol_treatment_a = df.loc[df["Treatment"] == "A", "Cholesterol"]
chol_treatment_b = df.loc[df["Treatment"] == "B", "Cholesterol"]
stats.ttest_ind(chol_treatment_a, chol_treatment_b, equal_var=True)

Ttest_indResult(statistic=1.6429180984069158, pvalue=0.10145456832306156)

In [38]:
# Box plot of Cholesterol per Treatment, with boxes coloured by Group.
cholesterol_box_fig = px.box(df, x="Treatment", y="Cholesterol", color="Group")
cholesterol_box_fig

In [42]:
# Scatter of Cholesterol against Age with an ordinary-least-squares trend line.
# NOTE: trendline="ols" relies on the `statsmodels` package being installed,
# which is not among this notebook's explicit imports.
age_cholesterol_fig = px.scatter(df, x="Age", y="Cholesterol", trendline="ols")
age_cholesterol_fig

In [41]:
# Pearson correlation coefficient between Age and Cholesterol.
df["Age"].corr(df["Cholesterol"], method="pearson")

0.07854733899208126