In [1]:
import numpy as np
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Exploratory Data Analysis

# Key EDA Questions

* What are the feature names and types?
* Are values missing?
* Which features are continuous and which are categorical?
* What is the distribution of the features?
* What is the distribution of the target?
* How do the features relate to the target?
* How do the variables relate to each other?

In [2]:
# Load the local copy of the cars data and preview its first rows
cars = pd.read_csv('data/cars_multivariate.csv')
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [3]:
# read_csv also accepts a URL, so the data can be read directly from a website.
# This rebinds `cars`, replacing the frame loaded from the local file.
cars = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Auto.csv',sep= ',')
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


Start with a very high level overview
- What do the first few rows look like?
- What are the different columns?
- Different data types
- Data summaries

In [4]:
# Preview the first rows to see the columns and typical values
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [5]:
# (number of rows, number of columns)
cars.shape

(397, 9)

In [6]:
# Summary statistics; only numeric columns are summarised, so a column read
# as object dtype (such as horsepower here) does not appear in the result
cars.describe()

Unnamed: 0,mpg,cylinders,displacement,weight,acceleration,year,origin
count,397.0,397.0,397.0,397.0,397.0,397.0,397.0
mean,23.515869,5.458438,193.532746,2970.261965,15.555668,75.994962,1.574307
std,7.825804,1.701577,104.379583,847.904119,2.749995,3.690005,0.802549
min,9.0,3.0,68.0,1613.0,8.0,70.0,1.0
25%,17.5,4.0,104.0,2223.0,13.8,73.0,1.0
50%,23.0,4.0,146.0,2800.0,15.5,76.0,1.0
75%,29.0,8.0,262.0,3609.0,17.1,79.0,2.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,3.0


## Are there any values missing?

In [7]:
# List each column's dtype and non-null count; a non-null count below the
# number of rows would indicate missing values
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 9 columns):
mpg             397 non-null float64
cylinders       397 non-null int64
displacement    397 non-null float64
horsepower      397 non-null object
weight          397 non-null int64
acceleration    397 non-null float64
year            397 non-null int64
origin          397 non-null int64
name            397 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.0+ KB


Do the variable types make sense? Notice that HP has variable type 'object' - this doesn't seem right.

What good are the columns if you don't know what they mean?

How can we draw conclusions about reasonable values and check our data?

https://cran.r-project.org/web/packages/ISLR/ISLR.pdf

In [8]:
# Inspect the last few horsepower values to see what the column contains
cars['horsepower'].tail()

392    86
393    52
394    84
395    79
396    82
Name: horsepower, dtype: object

All the values that we can see look like numbers. Why is horsepower an object then???

In [9]:
# Force convert horsepower to numeric.
# DataFrame.convert_objects() is deprecated and no longer exists in current
# pandas; pd.to_numeric with errors='coerce' is the supported replacement and
# turns entries that cannot be parsed into NaN.
conv = cars[['horsepower']].apply(pd.to_numeric, errors='coerce')
# Show only the first rows rather than the whole frame
conv.head(10)

For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  


Unnamed: 0,horsepower
0,130.0
1,165.0
2,150.0
3,150.0
4,140.0
5,198.0
6,220.0
7,215.0
8,225.0
9,190.0


In [None]:
# Forcibly convert this column to numeric; errors='coerce' replaces entries
# that cannot be parsed with NaN instead of raising. The result is only
# displayed here, not stored back into `cars`.
pd.to_numeric(cars['horsepower'],errors='coerce')

In [None]:
# Rows whose horsepower could not be parsed as a number (coerced to NaN)
hp_is_missing = pd.to_numeric(cars['horsepower'], errors='coerce').isnull()
cars.loc[hp_is_missing]

errors : {‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’
If ‘raise’, then invalid parsing will raise an exception
If ‘coerce’, then invalid parsing will be set as NaN
If ‘ignore’, then invalid parsing will return the input

In [None]:
# Store the numeric version of horsepower back into the frame
# (entries that cannot be parsed become NaN)
cars['horsepower'] = pd.to_numeric(cars['horsepower'],errors='coerce')

# Re-check dtypes and non-null counts after the conversion
cars.info()

In [None]:
# Reload the data, telling read_csv that '?' marks a missing value so that
# such entries are read as NaN at load time
cars = pd.read_csv('http://www-bcf.usc.edu/~gareth/ISL/Auto.csv',sep= ',',na_values='?')
cars.head()

In [None]:
# Let's just drop those rows
# notice the switch from isnull() to notnull()
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning
cars = cars.loc[pd.to_numeric(cars['horsepower'],errors='coerce').notnull()].copy()
cars.info()

In [None]:
# What about origin? And year (named "model" in the local cars_multivariate.csv)?
# Count how many cars carry each origin code
cars.origin.value_counts()

In [None]:
# Count how many cars there are for each value of year
cars['year'].value_counts()

In [None]:
# origin and year are codes rather than quantities, so store them as strings
for col in ('origin', 'year'):
    cars[col] = cars[col].astype(str)
cars.info()

Our data set has 5-6 numeric variables and 3-4 categorical variables (cylinders is kind of a wild card). Sometimes it is good to keep track of these:

In [None]:
# Columns treated as numeric (cylinders could arguably be treated as categorical)
num_vars = ['mpg','cylinders','displacement','horsepower','weight','acceleration']
# Columns treated as categorical
cat_vars = ['year', 'origin', 'name']

## More Exploration

### Univariate-Numeric Viz

In [None]:
# Histograms: one panel per numeric variable, 10 bins each
ax = cars[num_vars].hist(bins=10)
# Adjust subplot spacing so titles and tick labels do not overlap
plt.tight_layout()

In [None]:
# Boxplots: one matplotlib box per numeric variable on a 3x2 grid
fig, axes = plt.subplots(nrows=3, ncols=2)
for ax, var in zip(axes.flat, num_vars):
    ax.boxplot(cars[var])
    ax.set_title(var)
# Adjust subplot spacing so the panel titles do not collide
fig.tight_layout()

In [None]:
# ravel() flattens the grid of Axes returned by plt.subplots(3,2) into one dimension
axes.ravel().shape

In [None]:
# Shape of the Axes grid as returned by plt.subplots(3,2)
axes.shape

In [None]:
import seaborn as sns

In [None]:
# Pass the whole frame to seaborn and let it draw the boxes on a single shared axis
sns.boxplot(data=cars)

In [None]:
# Same plot without the weight column.
# `columns=` replaces the positional axis argument (drop('weight', 1)), which
# current pandas no longer accepts.
sns.boxplot(data=cars.drop(columns='weight'))

In [None]:
# Boxplots: one seaborn box per numeric variable on a 3x2 grid
fig, axes = plt.subplots(3,2)
for ax, var in zip(axes.ravel(), num_vars):
    sns.boxplot(y=var,data=cars,ax=ax)
    # Titles are set on the Axes object; seaborn itself has no set_title()
    ax.set_title(var)
# Adjust subplot spacing so titles and axis labels do not overlap
fig.tight_layout()

In [None]:
# Look at the first rows again before moving on to the categorical plots
cars.head()

### Univariate - Categorical

In [None]:
# Categorical: number of cars per origin.
# size() is the built-in group count and replaces the slower apply(len).
agg = cars.groupby('origin').size()
agg.plot(kind='bar')

In [None]:
# Categorical vs Numeric: average mpg for each origin
agg = cars.groupby('origin').mpg.mean()
agg.plot.bar()

In [None]:
# Categorical vs categorical vs numeric
# Mean mpg for every (origin, cylinders) pair, as a MultiIndex Series
agg = cars.groupby(['origin','cylinders'])['mpg'].mean()
# print() as a function call works on both Python 2 and Python 3
print(agg)
# Pivot cylinders into columns so each origin becomes one group of bars
agg = agg.unstack(level='cylinders')
print(agg)
agg.plot(kind='bar')

### Numeric vs Numeric

In [None]:
# Scatterplot matrix: pairwise scatter plots with a KDE on the diagonal.
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed.
from pandas.plotting import scatter_matrix
ax = scatter_matrix(cars[num_vars],figsize=(8,8),diagonal = 'kde')


In [None]:
# Use binning to see relationships more clearly:
# cut acceleration into 7 bins, then plot the mean mpg per bin
cars['binned_acceleration'] = pd.cut(cars['acceleration'], bins=7)
agg = cars.groupby('binned_acceleration').mpg.mean()
agg.plot.bar()

In [None]:
# Scatter plot by category: one marker series per origin
origins = cars['origin'].unique()
for origin in origins:
    # Boolean mask selecting the cars of the current origin
    in_origin = cars.origin == origin
    plt.plot(
        cars.loc[in_origin, 'acceleration'],
        cars.loc[in_origin, 'mpg'],
        linestyle='',
        marker='o',
        alpha=.7,
        label=origin,
    )
plt.xlabel('acceleration')
plt.ylabel('mpg')
plt.legend(numpoints=1)
    

In [None]:
# Categorical vs Categorical
# Contingency table: number of cars for each (origin, year) combination
pd.crosstab(cars['origin'], cars['year'])


Aha! Model means model year --- we probably should have left this as numeric

In [None]:
# Heat-map of mean mpg, with origin on the rows and year on the columns
import seaborn as sns

agg = cars.groupby(['origin', 'year']).mpg.mean()
mpg_grid = agg.unstack(level='year')
ax = sns.heatmap(mpg_grid, annot=True)
ax.set_title('MPG by origin and model year')

In [None]:
# Reshape to long form: one row per (car, variable) with the variable name in
# "Stat" and its value in "value".
# - `columns=` replaces the positional axis argument that current pandas rejects.
# - binned_acceleration (added by the binning cell) holds intervals rather than
#   numbers, so it is dropped to keep "value" numeric; errors='ignore' lets
#   this cell run even if that column was never created.
cars2 = pd.melt(
    cars.drop(columns=['name', 'weight', 'binned_acceleration'], errors='ignore'),
    id_vars=["year", "origin"],
    var_name="Stat",
)

In [None]:
# Distinct variable names stored in the Stat column of the melted frame
cars2.Stat.unique()

In [None]:
# Swarm plot of every melted variable, with the origins drawn side by side.
# `dodge=True` is the current name of the old `split=True` option.
sns.swarmplot(x="Stat", y="value", data=cars2, hue="origin", dodge=True)

## Linear Regression

In [None]:
# Single variable regression: start by plotting mpg against weight
cars.plot.scatter(x='weight', y='mpg')

Linear regression is just the fancy term for finding the line of best fit. If I were going to eyeball it from this data, I would draw the line from (1000, 45) through (5500, 0).

In other words, we are looking for the slope and intercept that define a line that fits the data as well as possible.

'As well as possible' means that we are trying to minimize the mean-squared-error

In [None]:
# Make a guess at the line of best fit by picking two [weight, mpg] points by eye
first_point = [1000,45]
second_point = [5500, 0]

# Solve 
def get_line_equation(p1, p2):
    """
    Find the line through two points by solving the 2x2 linear system
        y1 = m*x1 + b
        y2 = m*x2 + b
    for the unknowns m (slope) and b (intercept).

    Input:
    p1: first point [x1, y1]
    p2: second point [x2, y2]

    returns: slope, intercept
    """
    x1, y1 = p1[0], p1[1]
    x2, y2 = p2[0], p2[1]
    # Each row is [x, 1], so the unknown vector is [m, b]
    coefficients = [[x1, 1], [x2, 1]]
    targets = [[y1], [y2]]
    # The solution is a 2x1 column; flatten it to unpack both values
    m, b = np.linalg.solve(coefficients, targets).ravel()
    return m, b

slope, intercept = get_line_equation(first_point, second_point)


# Draw the guessed line on top of the weight-vs-mpg scatter
ax = cars.plot.scatter(x='weight', y='mpg')
xx = np.linspace(1000, 5500, 100)
ax.plot(xx, slope * xx + intercept, color='red', lw=3)
ax.set_xlim([1000, 5500])

How can we measure the error? The typical choice is to use mean squared error. The error for a given data point is the difference between the observed value and the predicted value
$$
MSE := \frac{1}{n} \sum_{i=1}^n (y_i - (mx_i + b))^2
$$


In [None]:
# Mean Squared Error

def mean_squared_error(X, y, m, b):
    """
    Mean squared error of the line y = m*X + b on the data (X, y).

    X: predictor values
    y: observed target values
    m: slope of the line
    b: intercept of the line

    returns: the mean of the squared differences between y and the prediction
    """
    residuals = y - (X * m + b)
    return np.mean(residuals ** 2)

# Evaluate the current slope/intercept with weight as predictor and mpg as target
mean_squared_error(cars['weight'], cars['mpg'], slope, intercept)

## R-Squared

Mean squared error is a good error metric, but it is not comparable across different data sets. For this we use a scaled version called $R^2$. 
\begin{align}
    R^2 &:= 1 - \frac{SS_{res}}{SS_{tot}} \\
    &= 1 - \frac{\sum_{i=1}^n (y_i - (mx_i + b))^2}{\sum_{i=1}^n (y_i - \bar{y})^2}
\end{align}    

Where $SS_{res}$ is the sum of the squared residuals and $SS_{tot}$ is the total sum of squares. $R^2$ can be interpreted as the fraction of the variance in the data that is explained by the model.

In [None]:
# Calculate r-squared

def r_squared(X, y, m, b):
    """
    R-squared of the line y = m*X + b on the data (X, y).

    X: predictor values
    y: observed target values (must provide a .mean() method)
    m: slope of the line
    b: intercept of the line

    returns: 1 - SS_res / SS_tot
    """
    # Sum of squared residuals of the model
    ss_res = np.sum((y - (X * m + b)) ** 2)
    # Total sum of squares around the mean of y
    ss_tot = np.sum((y - y.mean()) ** 2)
    return 1 - ss_res / ss_tot

# Evaluate the current slope/intercept with weight as predictor and mpg as target
r_squared(cars['weight'], cars['mpg'], slope, intercept)
    

## Ordinary least squares
It turns out that we can find the slope and intercept which *minimize* the mean squared error, using a procedure called ordinary least squares

Ordinary least squares is implemented in the statsmodels package. The advantage of this package is that we also have access to a number of *regression diagnostics.* 

In [None]:
import statsmodels.api as sm

# Design matrix: the weight predictor plus a constant column, which allows
# the model to fit an intercept
X = sm.add_constant(cars[['weight']])
y = cars['mpg']

# Build the OLS model, fit it, and display the diagnostics table
regressor = sm.OLS(y, X).fit()
regressor.summary()

## Model Diagnostics

**coef** - The values of the coefficients in the model

**$P>|t|$** - The p-value of the null hypothesis that a specific parameter is zero.

**R-Squared** - Proportion of variance explained by the model. Measured on a scale from 0 (bad) to 1 (good)

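**Prob (F-statistic)** - p-value of the F-statistic. This is the probability of observing an F-statistic at least this large if the null hypothesis, that *all slope coefficients in the model (every parameter except the intercept) are zero*, were true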

In [None]:
# Plot the fitted OLS line along with the data.
# Read the coefficients from the fitted model instead of retyping rounded
# values from the summary table, so the line stays in sync with the fit.
# 'const' is the column added by sm.add_constant; 'weight' is the predictor.
slope = regressor.params['weight']
intercept = regressor.params['const']
ax = cars.plot('weight','mpg',kind='scatter')
xx = np.linspace(1000, 5500, 100)
ax.plot(xx, xx*slope + intercept, color='red', lw=3)
ax.set_xlim([1000,5500])

In [None]:
# andrews_curves lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed.
from pandas.plotting import andrews_curves
data = pd.read_csv('data/iris.csv')
plt.figure()
# One curve per row, coloured by the class column
# (assumes iris.csv has a 'Name' column -- TODO confirm)
andrews_curves(data, 'Name')