In [9]:
# #1. A physician is evaluating a new diet for her patients with a family history of heart disease. 
# To test the effectiveness of this diet, 16 patients are placed on the diet for 6 months. 
# Their weights and triglyceride levels are measured before and after the study, 
# and the physician wants to know if either set of measurements has changed.

import numpy as np
import pandas as pd
import scipy.stats as stats

dietDF = pd.read_csv("dietstudy.csv")
display(dietDF.head())

##Step 1: Checking if sample truly represents the population or not::
#Yes


##Step 2: Defining Null & Alt Hypothesis::
#Null hypothesis: There is no difference in the levels of Triglycerides and weight of individual after using new diet 
#for 6 months. i.e.,  “the difference in the mean values is zero”

#Alt hypothesis: There is has been a significant difference in the levels of Triglycerides and weight of individual after 
#using new diet for 6 months.


##Step 3: Will decide what test to be done, based on the data::
#A paired sample t-test is used to determine whether there is a significant difference between the average values of the 
# same measurement made under two different conditions.


##Step 4: Will calculate the test statistic – it will be the Program Output::
#triglyceride levels
display('Test for -- triglyceride levels...')
triglyceride = stats.ttest_rel(a=dietDF.tg4, b = dietDF.tg0)
display(triglyceride)
print("The average triglyceride level of the customers at the start of the test is ${}".format(dietDF.tg0.mean()))
print("The average triglyceride level of the customers at the end of the test is ${}".format(dietDF.tg4.mean()))


##Step 5: Final conclusion based on p-value::
#Conclusion - triglyceride ::
# Across all 16 subjects, triglyceride levels dropped between 14 and 15 points on average after 6 months of the new diet.
#The significance value greater than 0.05 for change in triglyceride level shows the diet did not significantly 
#reduce their triglyceride levels.
display(triglyceride.pvalue > 0.05)
print('Conclusion: Since the p value fails to REJECT H0, will conclude that a==b, hence no effect of new diet.')

###################################################################################################################

##Weights....
display('Test for -- Weights...')
weight = stats.ttest_rel(a=dietDF.wgt4, b = dietDF.wgt0)
display(weight)
print("The average weight of the customers at the start of the test ${}".format(dietDF.wgt0.mean()))
print("The average weight of the customers at the end of the test ${}".format(dietDF.wgt4.mean()))


##Step 5: Final conclusion based on p-value::
#Conclusion - Weight::
# The subjects clearly lost weight over the course of the study; on average, about 8 units.
#Since the significance value for change in weight is less than 0.05, we can conclude that the average loss 
#of 8.06 units per patient is not due to chance variation, and can be attributed to the diet.
display(weight.pvalue > 0.05)
print('Conclusion: Since the p value supports the evidence of rejecting H0, will conclude that a>b, hence \
    had a good effect of new diet.')


Unnamed: 0,patid,age,gender,tg0,tg1,tg2,tg3,tg4,wgt0,wgt1,wgt2,wgt3,wgt4
0,1,45,Male,180,148,106,113,100,198,196,193,188,192
1,2,56,Male,139,94,119,75,92,237,233,232,228,225
2,3,50,Male,152,185,86,149,118,233,231,229,228,226
3,4,46,Female,112,145,136,149,82,179,181,177,174,172
4,5,64,Male,156,104,157,79,97,219,217,215,213,214


'Test for -- triglyceride levels...'

Ttest_relResult(statistic=-1.2000008533342437, pvalue=0.24874946576903698)

True

The average triglyceride level of the customers at the start of the test is $138.4375
The average triglyceride level of the customers at the end of the test is $124.375
Conclusion: Since the p value fails to REJECT H0, will conclude that a==b, hence the campaign had no effect of       new diet.


'Test for -- Weights...'

Ttest_relResult(statistic=-11.174521688532522, pvalue=1.137689414996614e-08)

False

The average weight of the customers at the start of the test $198.375
The average weight of the customers at the end of the test $190.3125
Since the p value supports the evidence of rejecting H0, will conclude that a>b, hence the campaign     had a good effect of new diet.


In [11]:
##2. An analyst at a department store wants to evaluate a recent credit card promotion. To this end, 
# 500 cardholders were randomly selected. Half received an ad promoting a reduced interest rate on purchases 
# made over the next three months, and half received a standard seasonal ad. Is the promotion effective to increase sales?

creditDF = pd.read_csv("creditpromo.csv")
display(creditDF.head())

##Step 1: Checking if sample truly represents the population or not::
#Yes


##Step 2: Defining Null & Alt Hypothesis::
#Null hypothesis: There is no difference in ad promotion.
#Alt hypothesis: There is has been a significant difference in ad promotion.


##Step 3: Will decide what test to be done, based on the data::
#The independent two-sample t-test is used to test whether population means are significantly different from each other.
#Independent samples are randomly drawn using the means.


##Step 4: Will calculate the test statistic – it will be the Program Output::
std_promo = creditDF[(creditDF['insert'] == 'Standard')]
new_promo = creditDF[(creditDF['insert'] == 'New Promotion')]

print("The average spent for std_promo ${}".format(std_promo.mean()))
print("The average spent for new_promo ${}".format(new_promo.mean()))

uneq_var = stats.ttest_ind(a = std_promo.dollars, b= new_promo.dollars, equal_var=False)  #Different variance
display(uneq_var.statistic)
display(uneq_var.pvalue)

eq_var = stats.ttest_ind(a = std_promo.dollars, b= new_promo.dollars, equal_var=True)  #Equal variance
display(eq_var.statistic)
display(eq_var.pvalue)

##Step 5: Final conclusion based on p-value::
##Conclusion ::
display(eq_var.pvalue > 0.05)
print('Conclusion: Since the p value supports the evidence of rejecting H0, will conclude that std_promo != new_promo. \
    Yes, there is a significant effect of new_promo.')

Unnamed: 0,id,insert,dollars
0,148,Standard,2232.771979
1,572,New Promotion,1403.807542
2,973,Standard,2327.092181
3,1096,Standard,1280.030541
4,1541,New Promotion,1513.5632


The average spent for std_promo $id         69003.000000
dollars     1566.389031
dtype: float64
The average spent for new_promo $id         64998.244000
dollars     1637.499983
dtype: float64


-2.260422726464996

0.024226348191648997

-2.2604227264649963

0.024225996894147814

False

Since the p value supports the evidence of rejecting H0, will conclude that a>b, hence the campaign     had a good effect of new diet.


In [13]:
# #3. An experiment is conducted to study the hybrid seed production of bottle gourd under open field conditions. 
# The main aim of the investigation is to compare natural pollination and hand pollination. 
# The data are collected on 10 randomly selected plants from each of natural pollination and hand pollination. 
# The data are collected on fruit weight (kg), seed yield/plant (g) and seedling length (cm). (Data set: pollination.csv)
# a. Is the overall population of Seed yield/plant (g) equals to 200?
# b. Test whether the natural pollination and hand pollination under open field conditions are equally effective or are 
# significantly different.

pollinationDF = pd.read_csv("pollination.csv")
display(pollinationDF.head())

##Step 1: Checking if sample truly represents the population or not::
#Yes


##Step 2: Defining Null & Alt Hypothesis::
#Null hypothesis: There is no significance of production under open field conditions.
#Alt hypothesis: There is a significance of production under open field conditions.


##Step 3: Will decide what test to be done, based on the data::
#The independent two-sample t-test is used to test whether population means are significantly different from each other.
#Independent samples are randomly drawn using the means.


##Step 4: Will calculate the test statistic – it will be the Program Output::
print("The overall population of Seed yield/plant: {}".format(pollinationDF.Seed_Yield_Plant.mean()))

natural_promo = pollinationDF[(pollinationDF['Group'] == 'Natural')]
hand_promo = pollinationDF[(pollinationDF['Group'] == 'Hand')]

print("The average of natural_promo: {}".format(natural_promo.mean()))
print("The average of hand_promo: {}".format(hand_promo.mean()))

uneq_var_Fruit_Wt = stats.ttest_ind(a = natural_promo.Fruit_Wt, b= hand_promo.Fruit_Wt, equal_var=False)  #Different variance
display(uneq_var_Fruit_Wt.statistic)

eq_var_Fruit_Wt = stats.ttest_ind(a = natural_promo.Fruit_Wt, b= hand_promo.Fruit_Wt, equal_var=True)  #Equal variance
display(eq_var_Fruit_Wt.statistic)
print("p-value of Fruit_Wt: {}".format(eq_var_Fruit_Wt.pvalue))


uneq_var_Seed_Yield_Plant = stats.ttest_ind(a = natural_promo.Seed_Yield_Plant, b= hand_promo.Seed_Yield_Plant, equal_var=False)  #Different variance
display(uneq_var_Seed_Yield_Plant.statistic)

eq_var_Seed_Yield_Plant= stats.ttest_ind(a = natural_promo.Seed_Yield_Plant, b= hand_promo.Seed_Yield_Plant, equal_var=True)  #Equal variance
display(eq_var_Seed_Yield_Plant.statistic)
print("p-value of Seed_Yield_Plant: {}".format(eq_var_Seed_Yield_Plant.pvalue))

uneq_var_Seedling_length = stats.ttest_ind(a = natural_promo.Seedling_length, b= hand_promo.Seedling_length, equal_var=False)  #Different variance
display(uneq_var_Seedling_length.statistic)

eq_var_Seedling_length = stats.ttest_ind(a = natural_promo.Seedling_length, b= hand_promo.Seedling_length, equal_var=True)  #Equal variance
display(eq_var_Seedling_length.statistic)
print("p-value of Seedling_length: {}".format(eq_var_Seedling_length.pvalue))


##Step 5: Final conclusion based on p-value::
#Conclusion::
print('a. The overall population of Seed yield/plant: 180.8035 < 200')
display(eq_var_Fruit_Wt.pvalue > 0.05)
print('b.Conclusion: Since the p value supports the evidence of rejecting H0, will conclude that \
    Natural Seed_Yield_Plant == Hand Seed_Yield_Plant. No significant difference.')

display(eq_var_Seedling_length.pvalue > 0.05)
print('b.Conclusion: Since the p value supports the evidence of rejecting H0, will conclude that \
    Natural Seedling_length == Hand Seedling_length. No significant difference.')

Unnamed: 0,Group,Fruit_Wt,Seed_Yield_Plant,Seedling_length
0,Natural,1.85,147.7,16.86
1,Natural,1.86,136.86,16.77
2,Natural,1.83,149.97,16.35
3,Natural,1.89,172.33,18.26
4,Natural,1.8,144.46,17.9


The overall population of Seed yield/plant: 180.8035
The average of natural_promo: Fruit_Wt              1.848
Seed_Yield_Plant    146.009
Seedling_length      17.707
dtype: float64
The average of hand_promo: Fruit_Wt              2.566
Seed_Yield_Plant    215.598
Seedling_length      18.590
dtype: float64


-17.669989614440286

-17.669989614440286

p-value of Fruit_Wt: 8.078362076486221e-13


-13.958260515902547

-13.958260515902547

p-value of Seed_Yield_Plant: 4.271481585484385e-11


-2.542229999657055

-2.542229999657055

p-value of Seedling_length: 0.020428817064110226
a. The overall population of Seed yield/plant: 180.8035 < 200


False

b. Since the p value supports the evidence of rejecting H0, will conclude that     Natural Seed_Yield_Plant == Hand Seed_Yield_Plant. No significant difference.


False

b. Since the p value supports the evidence of rejecting H0, will conclude that     Natural Seedling_length == Hand Seedling_length. No significant difference.


In [17]:
# #4. An electronics firm is developing a new DVD player in response to customer requests. 
# Using a prototype, the marketing team has collected focus data for different age groups viz. 
# Under 25; 25-34; 35-44; 45-54; 55-64; 65 and above. Do you think that consumers of various ages rated the design differently?

dvdDF = pd.read_csv("dvdplayer.csv")
display(dvdDF)

##Step 1: Checking if sample truly represents the population or not::
#Yes


##Step 2: Defining Null & Alt Hypothesis::
#Null hypothesis: There is no significance of consumers of various ages.
#Alt hypothesis: There is a significance of consumers of various ages.


##Step 3: Will decide what test to be done, based on the data::
#Perform the ANOVA: A group/categorical variable is influencing a continuous variable.


##Step 4: Will calculate the test statistic – it will be the Program Output::
SixtyFivePlus_promo = dvdDF.dvdscore.loc[dvdDF.agegroup == '65 and over']
FiftyFiveToSixtyFour_promo = dvdDF.dvdscore.loc[dvdDF.agegroup == '55-64']
FortyFiveToFiftyFour_promo = dvdDF.dvdscore.loc[dvdDF.agegroup == '45-54']
ThirtyFiveToFortyFour_promo = dvdDF.dvdscore.loc[dvdDF.agegroup == '35-44']
TwentyFiveToThirtyFour_promo = dvdDF.dvdscore.loc[dvdDF.agegroup == '25-34']
Under25_promo =dvdDF.dvdscore.loc[dvdDF.agegroup == 'Under 25']

anova = stats.f_oneway(Under25_promo,TwentyFiveToThirtyFour_promo,ThirtyFiveToFortyFour_promo,
               FortyFiveToFiftyFour_promo,FiftyFiveToSixtyFour_promo,SixtyFivePlus_promo)
print("The average of dvdscore: {}".format(dvdDF.dvdscore.mean()))
print("The STD of dvdscore: {}".format(dvdDF.dvdscore.std()))
print("p-value".format(anova.pvalue))

##Step 5: Final conclusion based on p-value::
#Conclusion ::
display(anova.pvalue > 0.05)
print('Conclusion: Since the p-value supports the evidence of rejecting H0, will conclude that there is a \
similarity among all age groups. No significant difference.')


Unnamed: 0,agegroup,dvdscore
0,65 and over,38.454803
1,55-64,17.669677
2,65 and over,31.704307
3,65 and over,25.924460
4,Under 25,30.450007
...,...,...
63,45-54,46.567682
64,65 and over,23.999491
65,Under 25,24.994419
66,65 and over,33.538502


The average of dvdscore: 31.95076354936375
The STD of dvdscore: 7.31999223969135
p-value


False

Conclusion: Since the p-value supports the evidence of rejecting H0, will conclude that there is a similarity among all age groups. No significant difference.


In [22]:
# #5. A survey was conducted among 2800 customers on several demographic characteristics. 
# Working status, sex, age, age-group, race, happiness, no. of child, marital status, educational qualifications, 
# income group etc. had been captured for that purpose. (Data set: sample_survey.csv).

# a. Is there any relationship in between labour force status with marital status?
# b. Do you think educational qualification is somehow controlling the marital status?
# c. Is happiness is driven by earnings or marital status?

surveyDF = pd.read_csv("sample_survey.csv")
surveyDF.head()

##Step 1: Checking if sample truly represents the population or not::
#Yes


##Step 2: Defining Null & Alt Hypothesis::
#Null hypothesis: There is no relationship between categorical variables.
#Alt hypothesis: There is a relationship between categorical variables.


##Step 3: Will decide what test to be done, based on the data::
#Chi-Square: Influence of one categorical variable on another


##Step 4: Will calculate the test statistic – it will be the Program Output::
# a. Is there any relationship in between labour force status with marital status?
wrkstat_marital_xtab = pd.crosstab(surveyDF.wrkstat,surveyDF.marital,margins=True)
a = stats.chi2_contingency(observed=wrkstat_marital_xtab)
display(a)

# b. Do you think educational qualification is somehow controlling the marital status?
degree_marital_xtab = pd.crosstab(surveyDF.degree,surveyDF.marital,margins=True)
b = stats.chi2_contingency(observed=degree_marital_xtab)
display(b)

# c. Is happiness driven by earnings or marital status?
happy_income_xtab = pd.crosstab(surveyDF.happy,surveyDF.income,margins=True)
c1 = stats.chi2_contingency(observed=happy_income_xtab)
display(c1)

happy_marital_xtab = pd.crosstab(surveyDF.happy,surveyDF.marital,margins=True)
c2 = stats.chi2_contingency(observed=happy_marital_xtab)
display(c2)

##Step 5: Final conclusion based on p-value::
##Conclusion ::
print('Conclusion: a. Since 1.82 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.')
print('Conclusion: b. Since 7.4 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.')
print('Conclusion: c1. Since 7.2 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.')
print('Conclusion: c2 Since 7.7 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.')



(729.2421426572284,
 1.820339965538765e-127,
 40,
 array([[5.16918728e+01, 1.55886926e+02, 7.68424028e+01, 1.07787986e+01,
         3.28000000e+01, 3.28000000e+02],
        [8.51024735e+00, 2.56643110e+01, 1.26508834e+01, 1.77455830e+00,
         5.40000000e+00, 5.40000000e+01],
        [6.20932862e+01, 1.87254417e+02, 9.23045936e+01, 1.29477032e+01,
         3.94000000e+01, 3.94000000e+02],
        [1.24501767e+01, 3.75459364e+01, 1.85077739e+01, 2.59611307e+00,
         7.90000000e+00, 7.90000000e+01],
        [7.24946996e+00, 2.18621908e+01, 1.07766784e+01, 1.51166078e+00,
         4.60000000e+00, 4.60000000e+01],
        [9.14063604e+00, 2.75653710e+01, 1.35879859e+01, 1.90600707e+00,
         5.80000000e+00, 5.80000000e+01],
        [2.46954770e+02, 7.44740283e+02, 3.67109894e+02, 5.14950530e+01,
         1.56700000e+02, 1.56700000e+03],
        [4.79095406e+01, 1.44480565e+02, 7.12197880e+01, 9.99010601e+00,
         3.04000000e+01, 3.04000000e+02],
        [4.46000000e+02, 1.345

(122.68449020508541,
 7.424404099753273e-15,
 25,
 array([[  75.06345268,  227.39312301,  111.83268345,   15.75824176,
           47.95249911,  478.        ],
        [  32.19248493,   97.52215526,   47.9617157 ,    6.75824176,
           20.56540234,  205.        ],
        [ 235.55476781,  713.57674583,  350.9393832 ,   49.45054945,
          150.4785537 , 1500.        ],
        [  32.66359447,   98.94930876,   48.66359447,    6.85714286,
           20.86635945,  208.        ],
        [  67.52570011,  204.55866714,  100.60262318,   14.17582418,
           43.1371854 ,  430.        ],
        [ 443.        , 1342.        ,  660.        ,   93.        ,
          283.        , 2821.        ]]))

(178.95053061216427,
 7.234749067043371e-21,
 36,
 array([[   3.89520355,   23.12777106,   21.66706973,   29.82265216,
          191.35187424,    2.92140266,    3.89520355,    4.26037888,
            4.01692866,    5.72108021,    7.06005643,    4.26037888,
          302.        ],
        [  18.16041919,  107.82748892,  101.01733172,  139.04070939,
          892.1305925 ,   13.62031439,   18.16041919,   19.86295848,
           18.72793229,   26.67311568,   32.91575977,   19.86295848,
         1408.        ],
        [   9.94437727,   59.04474002,   55.31559855,   76.13663845,
          488.51753325,    7.45828295,    9.94437727,   10.87666264,
           10.25513906,   14.60580411,   18.0241838 ,   10.87666264,
          771.        ],
        [  32.        ,  190.        ,  178.        ,  245.        ,
         1572.        ,   24.        ,   32.        ,   35.        ,
           33.        ,   47.        ,   58.        ,   35.        ,
         2481.        ]]))

(260.68943894182826,
 7.762777322980048e-47,
 15,
 array([[  53.6969697 ,  162.06060606,   79.27272727,   11.15151515,
           33.81818182,  340.        ],
        [ 248.58538324,  750.24527629,  366.98609626,   51.62495544,
          156.55828877, 1574.        ],
        [ 140.71764706,  424.69411765,  207.74117647,   29.22352941,
           88.62352941,  891.        ],
        [ 443.        , 1337.        ,  654.        ,   92.        ,
          279.        , 2805.        ]]))

Conclusion: a. Since 1.82 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.
Conclusion: b. Since 7.4 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.
Conclusion: c1. Since 7.2 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.
Conclusion: c2 Since 7.7 > 0.05 , the p-value failed to reject H0, will conclude that there is no influence.
