# Practical 6 - Descriptive Statistics

In [46]:
# import required packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import kurtosis
from scipy.stats import skew

In [28]:
# load data
data_frame = pd.read_csv('FDS Datasets/stats.csv')
data_frame

Unnamed: 0,Name,Salary,Country
0,Dan,40000,USA
1,Elizabeth,32000,Brazil
2,Jon,45000,Italy
3,Maria,54000,USA
4,Mark,72000,USA
5,Bill,62000,Brazil
6,Jess,92000,Italy
7,Julia,55000,USA
8,Jeff,35000,Italy
9,Ben,48000,Brazil


# 1. Basic Statistics

In [29]:
# No. of salaries present
data_frame["Salary"].count()

10

In [30]:
# Cumulative salaries
data_frame["Salary"].sum()

535000

In [31]:
# Cumulative salaries of various countries
data_frame.groupby(["Country"])["Salary"].sum()

Country
Brazil    142000
Italy     172000
USA       221000
Name: Salary, dtype: int64

In [32]:
# Entry count of various countries
data_frame.groupby(["Country"]).count()

Unnamed: 0_level_0,Name,Salary
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Brazil,3,3
Italy,3,3
USA,4,4


# 2. Mean

In [33]:
data_frame["Salary"].mean()

53500.0

# 3. Median

In [34]:
data_frame["Salary"].median()

51000.0

# 4. Mode

In [35]:
data_frame["Salary"].mode()

0    32000
1    35000
2    40000
3    45000
4    48000
5    54000
6    55000
7    62000
8    72000
9    92000
dtype: int64

# 5. Variance

In [36]:
data_frame["Salary"].var()

332055555.5555556

# 6. Standard Deviation

In [37]:
data_frame["Salary"].std()

18222.391598128816

# 7. Skewness

In [38]:
skewness = data_frame["Salary"].skew()

print(skewness)

if skewness < 0:
    print("The distribution is negatively skewed")
elif skewness > 0:
    print("The distribution is positively skewed")
else:
    print("The distribution is not skewed")

1.021551304801318
The distribution is positively skewed


In [39]:
# Load the BirthWeight dataset
data_frame_2=pd.read_csv('FDS Datasets/BirthWeight.csv')
data_frame_2.head()

Unnamed: 0,Infant ID,Gestational Age (Weeks),Birth Weight (Grams)
0,1,34.7,1895
1,2,36.0,2030
2,3,29.3,1440
3,4,40.1,2835
4,5,35.7,3090


In [40]:
# Set Infant ID as the index
data_frame_2.set_index("Infant ID", inplace = True)
data_frame_2.head()


Unnamed: 0_level_0,Gestational Age (Weeks),Birth Weight (Grams)
Infant ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,34.7,1895
2,36.0,2030
3,29.3,1440
4,40.1,2835
5,35.7,3090


# 8. Covariance

In [41]:
data_frame_2.cov()

Unnamed: 0,Gestational Age (Weeks),Birth Weight (Grams)
Gestational Age (Weeks),9.96,1798.02
Birth Weight (Grams),1798.02,485478.75


# 9. Correlation

In [42]:
data_frame_2.corr(method = "pearson")

Unnamed: 0,Gestational Age (Weeks),Birth Weight (Grams)
Gestational Age (Weeks),1.0,0.82
Birth Weight (Grams),0.82,1.0


In [43]:
# Load the diamonds dataset
pd.set_option("display.max_columns",None)  # to display all the columns
pd.options.display.float_format = "{:,.2f}".format

data_frame_3 = pd.read_csv("FDS Datasets/diamonds.csv")
data_frame_3.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [44]:
data_frame_4 = data_frame_3.drop(["id"], axis = 1)
for column in data_frame_4:
    if data_frame_4[column].dtype == "object":
        data_frame_4.drop([column], axis = 1, inplace = True)

stats_of_data_frame_4 = data_frame_4.describe()
stats_of_data_frame_4.rename(index = {"50%":"Median/50%"}, inplace = True)
stats_of_data_frame_4

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.8,61.75,57.46,3932.8,5.73,5.73,3.54
std,0.47,1.43,2.23,3989.44,1.12,1.14,0.71
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
Median/50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [45]:
var = data_frame_4.var()

var_list = []
for col in data_frame_4.columns:
    if data_frame_4[col].dtype == "object":
        continue
    var_list.append(round(data_frame_4[col], 5))


data_frame_5 = pd.DataFrame([var_list], columns = stats_of_data_frame_4.columns, index = ["var"])
stats_of_data_frame_5 = stats_of_data_frame_4.append(data_frame_5)
stats_of_data_frame_5

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.00,53940.00,53940.00,53940.00,53940.00,53940.00,53940.00
mean,0.80,61.75,57.46,3932.80,5.73,5.73,3.54
std,0.47,1.43,2.23,3989.44,1.12,1.14,0.71
min,0.20,43.00,43.00,326.00,0.00,0.00,0.00
25%,0.40,61.00,56.00,950.00,4.71,4.72,2.91
Median/50%,0.70,61.80,57.00,2401.00,5.70,5.71,3.53
75%,1.04,62.50,59.00,5324.25,6.54,6.54,4.04
max,5.01,79.00,95.00,18823.00,10.74,58.90,31.80
var,0 0.23 1 0.21 2 0.23 3 ...,0 61.50 1 59.80 2 56.90 3 ...,0 55.00 1 61.00 2 65.00 3 ...,0 326 1 326 2 327 3 ...,0 3.95 1 3.89 2 4.05 3 ...,0 3.98 1 3.84 2 4.07 3 ...,0 2.43 1 2.31 2 2.31 3 ...
