# Descriptive Statistics

In [None]:
#importing the basic libraries
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import warnings

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Read the CSV file into a DataFrame (path is relative to the working directory)
df = pd.read_csv("superstore.csv")

In [None]:
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

## Univariate Analysis

In [None]:
# Summary statistics for the Sales column
df['Sales'].describe()

In [None]:
## Central Tendency (Mean)

# Compute the mean once and reuse it for each presentation below
sales_mean = df['Sales'].mean()

# raw floating-point value
print("Mean: ", sales_mean)

# truncated to a whole number (int() drops the fractional part)
sales_mean_int = int(sales_mean)
print("Integerized Mean: ", sales_mean_int)

# with thousands separators
print("Standardized Mean: ", format(sales_mean_int, ","))

In [None]:
# The mean is pulled upward by large outliers,
# so also report the median (the central value), which is less sensitive to them.

print("Median: ", df['Sales'].median())

In [None]:
# Mode for categorical variables: first list the distinct categories
df['Ship Mode'].unique()

In [None]:
# Most frequent category; mode() returns a Series because ties are possible
df['Ship Mode'].mode()

In [None]:
# include='all' summarises non-numeric columns as well as numeric ones
df.describe(include='all')

## Bivariate Analysis

In [None]:
# Setting styles - You can skip this, this will add gridlines for better readability.
# The default figure size is set through seaborn's rc parameters so that it applies
# to the plots in later cells. A bare plt.figure(figsize=...) here would only render
# an empty figure for this cell and would not size the plots that follow.
sns.set(style="whitegrid", rc={"figure.figsize": (10, 6)})

In [None]:
#sns.scatterplot(data=df, x="Order ID", y="Sales")


In [None]:
# Relationship between two numeric columns: Sales (x) against Profit (y)
sns.scatterplot(data=df, x="Sales", y="Profit")

# Descriptive Analytics using Visualizations in Python

### Histogram

A histogram is a classic visualization tool that represents the distribution of one or more variables by counting the number of observations that fall within discrete bins.

In [None]:
# Distribution of Sales as binned counts
sns.histplot(df["Sales"])

In [None]:
# Horizontal box plot: Sales on the x-axis
sns.boxplot(x=df['Sales']) 

In [None]:
# Vertical box plot: Sales on the y-axis
sns.boxplot(y=df['Sales'])

In [None]:
# Distribution of Profit as binned counts
sns.histplot(df["Profit"])

In [None]:
# Horizontal box plot of Profit
sns.boxplot(x=df["Profit"])

In [None]:
# Ship Mode is treated as categorical earlier in the notebook, so each bar here
# should be the row count of one shipping mode
sns.histplot(df["Ship Mode"])

### Distribution Plot

A distplot, or distribution plot, depicts how the values of a variable are distributed.
It shows the univariate distribution of the data, i.e. the values of a single variable plotted against their estimated density. The curve represents the relative likelihood of different values occurring in the dataset.

In [None]:
# sns.distplot is deprecated in seaborn. histplot with a KDE overlay on a density
# scale is the documented replacement and gives the same histogram-plus-curve view.
sns.histplot(df['Sales'], kde=True, stat="density")
# skew() and kurt() summarise the asymmetry and tail weight of the distribution
print("Skewness: %f" % df['Sales'].skew())
print("Kurtosis: %f" % df['Sales'].kurt())

In [None]:
# sns.distplot is deprecated in seaborn. histplot with a KDE overlay on a density
# scale is the documented replacement and gives the same histogram-plus-curve view.
sns.histplot(df['Profit'], kde=True, stat="density")
# skew() and kurt() summarise the asymmetry and tail weight of the distribution
print("Skewness: %f" % df['Profit'].skew())
print("Kurtosis: %f" % df['Profit'].kurt())

In [None]:
# sns.distplot is deprecated in seaborn. histplot with a KDE overlay on a density
# scale is the documented replacement and gives the same histogram-plus-curve view.
sns.histplot(df['Quantity'], kde=True, stat="density")
# skew() and kurt() summarise the asymmetry and tail weight of the distribution
print("Skewness: %f" % df['Quantity'].skew())
print("Kurtosis: %f" % df['Quantity'].kurt())

### Box Plot

Boxplots are used to visualize distributions. They are very useful when you want to compare data between two groups. A boxplot is sometimes called a box-and-whisker plot.

In [None]:
# Horizontal box plot of Sales
sns.boxplot(x=df["Sales"])

In [None]:
# Horizontal box plot of Quantity
sns.boxplot(x=df["Quantity"])

### Pair Plots using Seaborn

Pairplot is used to describe pairwise relationships in a dataset. Pairplot is used to visualize the univariate distribution of all variables in a dataset along with all of their pairwise relationships.
The diagonal plots are histograms and all the other plots are scatter plots.

In [None]:
# Grid of pairwise plots across the columns of df; this can be slow on wide frames
sns.pairplot(df)

# Y-Data Profiling

In [None]:
#!pip install ydata-profiling
# previously this was pandas profiling

In [None]:
#y-data profiling
# Build a profiling report for df and write it to output.html
# (relative path, so it lands in the working directory).
from ydata_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_file(output_file='output.html')

# More Examples

### Box and Whiskers Plot

In [None]:
# Build a small employee table column by column.
# NOTE: this rebinds `df`, which held the CSV data loaded earlier in the notebook.
df = pd.DataFrame({
    'name': ['Ahmed', 'Saleh', 'Hashir', 'Qasim', 'Junaid', 'Ammaar', 'Bill Gates'],
    'salary': [1200, 22000, 25000, 35000, 25000, 15000, 300000],
    'age': [22, 33, 44, 23, 42, 32, 52],
})

# display it
df

In [None]:
# Box-and-whisker plot of salary. sym='o' draws outlier points as circles;
# return_type='axes' makes the call return the matplotlib Axes.
df.boxplot(column='salary', sym='o', return_type='axes')

In [None]:
# Drop the 'Bill Gates' row and redraw the box plot without that extreme salary
mask = df['name'] == 'Bill Gates'
df1 = df.loc[~mask]

df1.boxplot(column='salary', sym='o', return_type='axes')

In [None]:
# Set the whiskers at the min and max (0th and 100th percentiles) so that the
# whiskers span the full data range and no point is drawn as an outlier.
# Without `whis`, matplotlib places whiskers at 1.5 * IQR beyond the quartiles.
df1.boxplot(column='salary', sym='o', whis=(0, 100), return_type='axes')

## Scatter Plot

In [None]:
import matplotlib.pyplot as plt

# Create data from a seeded generator so the figure is identical on every run
N = 20
rng = np.random.default_rng(42)
x = rng.random(N)
y = rng.random(N)
print(x)
print(y)
colors = [[1,0,0]] # one RGB triple used for every point; (1, 0, 0) is red. try out different values of RGB between 0 and 1
area = np.pi*10 # marker size passed to scatter's `s` argument. try out different values

# Plot
plt.scatter(x, y, s = area, c=colors, alpha=0.9) # alpha is the opacity (0 = transparent, 1 = opaque). try out different values
plt.title('Scatter Plot Example')
plt.xlabel('x label')
plt.ylabel('y label')
plt.show()

In [None]:
# Age vs. salary for the rows of df1, reusing `area` and `colors` from the previous cell.
plt.scatter(df1.age, df1.salary, s = area, c=colors, alpha=0.9) # alpha is the opacity of the markers. try out different values
# Label the figure so it can be read on its own
plt.title('Salary vs. Age')
plt.xlabel('age')
plt.ylabel('salary')
plt.show()

## Variance and Standard Deviation

In [None]:
# Two series with the same mean (10) but very different spread
list_one = pd.Series([-10, 0, 10, 20, 30])
list_two = pd.Series([8, 9, 10, 11, 12])

# Print each series under its heading
for heading, series in (("list one is", list_one), ("list two is", list_two)):
    print(heading)
    print(series)

In [None]:
# Compute the centre of each series first, then report them together
mean_one, mean_two = list_one.mean(), list_two.mean()
median_one, median_two = list_one.median(), list_two.median()

print("The mean of list_one is", mean_one)
print("The mean of list_two is", mean_two)
print("The median of list_one is", median_one)
print("The median of list_two is", median_two)

In [None]:
# ddof=1 divides by (n - 1), giving the sample variance;
# ddof=0 would divide by n, giving the population variance.
print(list_one.var(ddof=1))
print(list_two.var(ddof=1))

In [None]:
import math
# Data points [-10, 0, 10, 20, 30]
# Sample variance (ddof=1): 250   (the population variance, ddof=0, would be 200)
# The standard deviation is derived from the data rather than a hard-coded number,
# so it cannot drift from the variance the series actually has.
sd1 = math.sqrt(list_one.var(ddof=1))
sd1

In [None]:
# Data points [8, 9, 10, 11, 12]
# Sample variance (ddof=1): 2.5   (the population variance, ddof=0, would be 2.0)
# Uses the same sample (ddof=1) convention as sd1, so the two standard
# deviations are directly comparable.
sd2 = math.sqrt(list_two.var(ddof=1))
sd2

In [None]:
#define a class to plot the normal distribution curve
class norm1:
    """Plot the density curve of a normal distribution.

    The attributes keep the constructor's argument names: ``a1`` is used as
    the mean, ``b1`` as the standard deviation and ``c1`` as the x values at
    which the density is evaluated.
    """

    def __init__(self, a1, b1, c1):
        """Store the mean (a1), standard deviation (b1) and x values (c1)."""
        self.a1, self.b1, self.c1 = a1, b1, c1

    def dist_curve(self):
        """Draw the normal density over ``self.c1`` and show the figure.

        Returns None.
        """
        mean, sd, xs = self.a1, self.b1, self.c1
        # Normal pdf: 1 / (sd * sqrt(2*pi)) * exp(-(x - mean)^2 / (2 * sd^2))
        scale = 1/(sd * np.sqrt(2 * np.pi))
        density = scale * np.exp( - (xs - mean)**2 / (2 * sd**2) )
        plt.plot(xs, density, linewidth=2, color='y')
        plt.show()

# Draw 1000 samples from a normal distribution with list_one's mean and sd1 as its spread
c = np.random.normal(loc=list_one.mean(), scale=sd1, size=1000)

# 60-bin density histogram; plt.hist returns (bin heights, bin edges, patches)
w1, x1, z1 = plt.hist(c, bins=60, density=True) #hist
print(x1)

In [None]:
# Overlay the theoretical normal curve on the histogram of the samples in `c`.
# With the inline backend each cell renders its own figure, so the histogram is
# redrawn here; otherwise the curve would appear alone on an empty figure.
plt.hist(c, 60, density=True)
hist1 = norm1(list_one.mean(), sd1, x1)
plot1 = hist1.dist_curve()  # dist_curve() shows the figure and returns None

In [None]:
# sns.distplot is deprecated in seaborn. histplot with a KDE overlay on a density
# scale is the documented replacement and gives the same histogram-plus-curve view.
sns.histplot(list_one, kde=True, stat="density")

In [None]:
# Spread of the salary column, using the sample (ddof=1) variance
salary = df['salary']
varsal = salary.var(ddof=1)
sdsal = math.sqrt(varsal)

# standard deviation first, then the mean for comparison
print(sdsal)
print(salary.mean())

In [None]:
# Histogram of every salary in df, including the 300000 value
df.salary.hist()

In [None]:
# Histogram of salaries in df1, i.e. with the 'Bill Gates' row removed
df1.salary.hist()