# Case Study on ANOVA
 XYZ Company has offices in four different zones. The company wishes to
investigate the following:

● The mean sales generated by each zone.

● Total sales generated by all the zones for each month.

● Check whether all the zones generate the same amount of sales.

Help the company to carry out their study with the help of data provided.

In [136]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import scipy

In [77]:
# Load the zone-wise monthly sales figures.
# A raw string keeps the Windows backslashes literal: in a normal string "\P", "\A"
# and "\S" are invalid escape sequences, which current Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook so the analysis can be re-run elsewhere.
data = pd.read_csv(r"E:\PAATSHAALA\Assignments\Sales_data_zone_wise.csv")

In [78]:
# Preview the first five rows (five is the default for head()).
data.head()

Unnamed: 0,Month,Zone - A,Zone - B,Zone - C,Zone - D
0,Month - 1,1483525,1748451,1523308,2267260
1,Month - 2,1238428,1707421,2212113,1994341
2,Month - 3,1860771,2091194,1282374,1241600
3,Month - 4,1871571,1759617,2290580,2252681
4,Month - 5,1244922,1606010,1818334,1326062


In [79]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Month     29 non-null     object
 1   Zone - A  29 non-null     int64 
 2   Zone - B  29 non-null     int64 
 3   Zone - C  29 non-null     int64 
 4   Zone - D  29 non-null     int64 
dtypes: int64(4), object(1)
memory usage: 1.3+ KB


In [80]:
# Make sure the month labels are stored as strings.
data["Month"] = data["Month"].astype(str)

In [81]:
# Summary statistics (count, mean, std, min, quartiles, max) for the zone columns.
data.describe()

Unnamed: 0,Zone - A,Zone - B,Zone - C,Zone - D
count,29.0,29.0,29.0,29.0
mean,1540493.0,1755560.0,1772871.0,1842927.0
std,261940.1,168389.9,333193.7,375016.5
min,1128185.0,1527574.0,1237722.0,1234311.0
25%,1305972.0,1606010.0,1523308.0,1520406.0
50%,1534390.0,1740365.0,1767047.0,1854412.0
75%,1820196.0,1875658.0,2098463.0,2180416.0
max,2004480.0,2091194.0,2290580.0,2364132.0


# ● The mean sales generated by each zone.

In [82]:
# (rows, columns) of the loaded frame.
data.shape

(29, 5)

In [83]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

Month       0
Zone - A    0
Zone - B    0
Zone - C    0
Zone - D    0
dtype: int64

In [84]:
# Mean monthly sales per zone.
# numeric_only=True restricts the reduction to the numeric zone columns; the frame
# also holds the string column 'Month', and pandas 2.x raises a TypeError instead
# of silently dropping non-numeric columns when this flag is omitted.
data.mean(numeric_only=True)

Zone - A    1.540493e+06
Zone - B    1.755560e+06
Zone - C    1.772871e+06
Zone - D    1.842927e+06
dtype: float64

In [85]:
# Column totals over all months.
# NOTE(review): 'Month' holds strings, so its "sum" is just every label concatenated
# and the result falls back to dtype object; pass numeric_only=True to total only
# the zone columns.
data.sum()

Month       Month - 1Month - 2Month - 3Month - 4Month - 5M...
Zone - A                                             44674301
Zone - B                                             50911228
Zone - C                                             51413260
Zone - D                                             53444876
dtype: object

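### In order of highest mean to lowest, Zone - D tops the table, followed by Zone - C, Zone - B and Zone - A.
### The total sales follow the same ranking, i.e. a zone with higher total sales also has a higher mean, because every zone has the same number of months (29).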

# ● Total sales generated by all the zones for each month.

In [86]:
# Work on an independent copy of the loaded data.
# `data` is already a DataFrame, and wrapping it in pd.DataFrame(...) again does
# not copy by default, so later column assignments on `df` could leak back into
# `data`. An explicit copy keeps the raw frame untouched.
df = data.copy()

In [87]:
# Ensure the month labels are strings before they are used as the grouping key.
df['Month']=df['Month'].astype(str)

In [126]:
# Zone sales per month, largest first: ordered by Zone - A, with Zone - B, C and D
# used in turn to break ties.
(
    df.groupby(by="Month")
    .sum()
    .sort_values(
        by=["Zone - A", "Zone - B", "Zone - C", "Zone - D"],
        ascending=False,
    )
)

Unnamed: 0_level_0,Zone - A,Zone - B,Zone - C,Zone - D
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Month - 11,2004480,1740365,1317869,1970069
Month - 23,1887694,1992155,1348387,1459683
Month - 24,1880820,1752873,2098463,2052591
Month - 4,1871571,1759617,2290580,2252681
Month - 3,1860771,2091194,1282374,1241600
Month - 10,1852450,1913059,1754314,1608387
Month - 14,1821799,1844081,1617376,1872259
Month - 7,1820196,1992031,1786826,1688055
Month - 18,1770324,1699213,1249821,2364132
Month - 9,1652644,1873402,1755290,1422059


In [127]:
# Keep the per-month table (indexed by Month, sorted by zone sales descending)
# so a combined total can be added to it.
df2 = (
    df.groupby(by="Month")
    .sum()
    .sort_values(
        by=["Zone - A", "Zone - B", "Zone - C", "Zone - D"],
        ascending=False,
    )
)

In [128]:
# Combined sales of the four zones for each month.
df2["Total"] = (
    df2["Zone - A"]
    .add(df2["Zone - B"])
    .add(df2["Zone - C"])
    .add(df2["Zone - D"])
)

In [129]:
# Rank the months by their combined sales, highest first.
(
    df2.groupby(by="Month")
    .sum()
    .sort_values(by=["Total"], ascending=False)
)

Unnamed: 0_level_0,Zone - A,Zone - B,Zone - C,Zone - D,Total
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Month - 4,1871571,1759617,2290580,2252681,8174449
Month - 8,1625696,1665534,2161754,2363315,7816299
Month - 24,1880820,1752873,2098463,2052591,7784747
Month - 22,1481619,1527574,2255729,2295079,7560001
Month - 28,1616640,1547991,2128022,2178267,7470920
Month - 21,1570152,2013615,1540016,2265814,7389597
Month - 7,1820196,1992031,1786826,1688055,7287108
Month - 14,1821799,1844081,1617376,1872259,7155515
Month - 2,1238428,1707421,2212113,1994341,7152303
Month - 6,1534390,1573128,1751825,2292044,7151387


### Month - 4 recorded the highest total sales across all zones (8,174,449).
### Zone - D recorded the highest total sales when comparing zone-wise totals (53,444,876).


# ● Check whether all the zones generate the same amount of sales.

### Null Hypothesis: all the zones generate the same mean amount of sales.
### Alternative Hypothesis: at least one zone's mean sales differs from the others.

In [133]:
# One-way ANOVA comparing the monthly sales of the four zones.
# F is the test statistic and p the corresponding p-value.
# scipy.stats is already imported as `stats` in the notebook's import cell, so it
# is not imported again here.
F, p = stats.f_oneway(
    data['Zone - A'],
    data['Zone - B'],
    data['Zone - C'],
    data['Zone - D'],
)

In [134]:
# Show the F statistic followed by the p-value.
print(F,p)

5.672056106843581 0.0011827601694503335


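### Insights -- The p-value (≈ 0.0012) is less than 0.05.
### The null hypothesis is rejected.
### i.e. not all the zones generate the same mean amount of sales; at least one zone differs from the others. (ANOVA alone does not show which zones differ; a post-hoc test would be needed for that.)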