In [1]:
import numpy as np
from scipy import stats
%matplotlib inline
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import pandas as pd
import statsmodels.api as sm
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

  from pandas.core import datetools


## Class 06 - Examples

<b>Example 7.4</b><br>
Using the following data:
$$\overline{x} = 101.8, \ s_{x} = 63.2, \  \overline{y} = 101.8, \ s_{y} = 5.46, \ R = -0.499$$
Calculate the slope of the regression line against family income

In [2]:
mean_x = 101.8
sd_x = 63.2
mean_y = 101.8
sd_y = 5.46
r = -0.499

slope = (sd_y / sd_x) * r
print("Slope is %.4f" % slope)

Slope is -0.0431


<b>Problem 7.6 - Husbands and Wives</b><br>
The Great Britain Office of Population Census and Surveys once collected data on a random sample of 170 married couples in Britain, recording the age (in years) and heights (converted here to inches) of the husbands and wives.16 The scatterplot on page 358 shows the wife’s age plotted against her husband’s age, and the plot on the right shows wife’s height plotted against husband’s height.

In [3]:
hw_df = pd.read_csv("husbandsWives.csv")
hw_df.head()

Unnamed: 0,Age_Husband,Ht_Husband,Age_Wife,Ht_Wife,Age_Husb_at_Marriage,Years_Married,Age_Wife_At_Marriage,Duration
0,49,1809,43.0,1590,25.0,24.0,19.0,>20
1,25,1841,28.0,1560,19.0,6.0,22.0,<= 20
2,40,1659,30.0,1620,38.0,2.0,28.0,<= 20
3,52,1779,57.0,1540,26.0,26.0,31.0,>20
4,58,1616,52.0,1420,30.0,28.0,24.0,>20


In [4]:
# Do the same using plotly
trace = go.Scatter(
    x = hw_df["Age_Husband"],
    y = hw_df["Age_Wife"],
    mode = 'markers'
)

layout= go.Layout(
    title= 'Husband Ave vs Wife Age',
    xaxis= dict(
        title= 'Husband Age',
    ),
    yaxis=dict(
        title= 'Wife Age',
    ),
    showlegend= False
)

fig= go.Figure(data=[trace], layout=layout)
iplot(fig)


In [5]:
# Do the same using plotly
trace = go.Scatter(
    x = hw_df["Ht_Husband"],
    y = hw_df["Ht_Wife"],
    mode = 'markers'
)

layout= go.Layout(
    title= 'Husband Height vs Wife Height',
    xaxis= dict(
        title= 'Husband Height',
    ),
    yaxis=dict(
        title= 'Wife Height',
    ),
    showlegend= False
)

fig= go.Figure(data=[trace], layout=layout)
iplot(fig)

In [6]:
# Finding the correlations
df_temp = hw_df.loc[(hw_df["Ht_Husband"] > 0) & (hw_df["Ht_Wife"] > 0)]
print(stats.pearsonr(df_temp["Ht_Husband"], df_temp["Ht_Wife"])[0])

df_temp = hw_df.loc[(hw_df["Age_Husband"] > 0) & (hw_df["Age_Wife"] > 0)]
print(stats.pearsonr(df_temp["Age_Husband"], df_temp["Age_Wife"])[0])

0.364433700424
0.938559797145


<b>7.17 Correlation, Part I.</b><br>
What would be the correlation between the ages of husbands and wives if men always married woman who were:<br>
(a) 3 years younger than themselves?<br>
(b) 2 years older than themselves?<br>
(c) half as old as themselves?

In [7]:
mens_ages = [53, 55, 58, 56, 46, 37, 25]
womens_ages = [50, 52, 55, 53, 43, 34, 22]

# Do the same using plotly
trace = go.Scatter(
    x = mens_ages,
    y = womens_ages,
    mode = 'markers'
)

layout= go.Layout(
    title= "Husband's Age Minus 3",
    xaxis= dict(
        title= 'Husband Age',
    ),
    yaxis=dict(
        title= 'Wife Age',
    ),
    showlegend= False
)

fig= go.Figure(data=[trace], layout=layout)
iplot(fig)

In [8]:
print(stats.pearsonr(mens_ages, womens_ages)[0])

1.0


<b>7.37 - Husbands and Wives II</b><br>
The scatterplot above summarizes husbands’ and wives’ heights in a random sample of 170 married couples in Britain, where both partners’ ages are below 65 years.<br>
(a) Is there strong evidence that taller men marry taller women? State the hypotheses and include any information used to conduct the test.<br>
(b) Write the equation of the regression line for predicting wife’s height from husband’s height.<br>
(c) Interpret the slope and intercept in the context of the application.<br>
(d) Given that R2 = 0.09, what is the correlation of heights in this data set?<br>
(e) You meet a married man from Britain who is 5’9” (69 inches). What would you predict his
wife’s height to be? How reliable is this prediction?<br>
(f) You meet another married man from Britain who is 6’7” (79 inches). Would it be wise to use
the same linear model to predict his wife’s height? Why or why not?<br>

<b>a</b>$$ H_{0}: \beta_{1},\ \ H_{A}: \beta_{1} > 0$$
Although p-values assocaited with correlations are not the most reliable, the strong p-value associated with this correlation suggests that we can reject the null hypothesis that there is no slope.

<b>b</b>

In [9]:
model = ols("Ht_Wife ~ Ht_Husband", data=hw_df).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                Ht_Wife   R-squared:                       0.133
Model:                            OLS   Adj. R-squared:                  0.128
Method:                 Least Squares   F-statistic:                     30.17
Date:                Sun, 04 Mar 2018   Prob (F-statistic):           1.21e-07
Time:                        06:48:11   Log-Likelihood:                -1090.4
No. Observations:                 199   AIC:                             2185.
Df Residuals:                     197   BIC:                             2191.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   1028.5718    104.469      9.846      0.0

In [10]:
# The LinearRegression method expects 2-D numpy arrays for the data
husb_height_array = np.array(hw_df["Ht_Husband"]).reshape(-1,1)
wife_height_array = np.array(hw_df["Ht_Wife"]).reshape(-1,1)

regr = linear_model.LinearRegression()
regr.fit(husb_height_array, wife_height_array)
print("Coefficient(s): %.4f" % regr.coef_)
print("Intercept: %.4f" % regr.intercept_)

Coefficient(s): 0.3310
Intercept: 1028.5718



internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.



The equation is:<br>
$$H_{w} = 0.3310 \times H_{h} + 1028.57$$ 

<b>MarioKart</b><br>
The MarioKart dataset contains data from 141 online auctions, some of games in new condition, others in used condition.

In [11]:
mk_df = pd.read_csv("mariokart.csv")

In [12]:
mk_df.head()

Unnamed: 0,ID,duration,nBids,cond,startPr,shipPr,totalPr,shipSp,sellerRate,stockPhoto,wheels,title
0,150377000000.0,3,20,new,0.99,4.0,51.55,standard,1580,yes,1,~~ Wii MARIO KART &amp; WHEEL ~ NINTENDO Wii ~...
1,260483000000.0,7,13,used,0.99,3.99,37.04,firstClass,365,yes,1,Mariokart Wii Nintendo with wheel - Mario Kart...
2,320432000000.0,3,16,new,0.99,3.5,45.5,firstClass,998,no,1,Mario Kart Wii (Wii)
3,280405000000.0,3,18,new,0.99,0.0,44.0,standard,7,yes,1,Brand New Mario Kart Wii Comes with Wheel. Fre...
4,170392000000.0,1,20,new,0.01,0.0,71.0,media,820,yes,2,BRAND NEW NINTENDO 1 WII MARIO KART WITH 2 WHE...


In [13]:
mk_df["salePr"] = mk_df["totalPr"] - mk_df["shipPr"]

In [14]:
# Get the 'Beer' and 'Water' datasets from the data frame
new_prices = mk_df["salePr"][mk_df["cond"]=="new"]
used_prices = mk_df["salePr"][mk_df["cond"]=="used"]

# Declare the trace objects
trace0 = go.Box(
    y=used_prices,
    name="Used"
)
trace1 = go.Box(
    y=new_prices,
    name="New"
)

data = [trace0, trace1]

layout = go.Layout(
    title = "Mariokart prices",
    yaxis = dict(title="Selling Price ($)"),
    xaxis = dict(title="Condition")
)

# Generate the plot
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [15]:
mk_df["cond_var"] = ""
mk_df["cond_var"][mk_df["cond"] == "used"] = 0
mk_df["cond_var"][mk_df["cond"] == "new"] = 1
model = ols("salePr ~ cond_var", data=mk_df).fit()
print(model.summary())



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



                            OLS Regression Results                            
Dep. Variable:                 salePr   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.345
Method:                 Least Squares   F-statistic:                     74.63
Date:                Sun, 04 Mar 2018   Prob (F-statistic):           1.20e-14
Time:                        06:48:12   Log-Likelihood:                -485.82
No. Observations:                 141   AIC:                             975.6
Df Residuals:                     139   BIC:                             981.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        39.7366      0.844     47.083



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

