In [1]:
# includes

import numpy as np
import pandas as pd
from bokeh.plotting import figure, show, output_file
from bokeh.models import Range1d

from statsmodels.sandbox.regression.predstd import wls_prediction_std

from sklearn.linear_model import Ridge, Lasso

import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix

from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Statsmodel APIs
import statsmodels.api as sm

In [2]:
# Bokeh output file: send Bokeh output to hw04.html (page title "analysis")
output_file("hw04.html", title = "analysis")


In [3]:
def projectile():
    """Fit a quadratic height-vs-time curve to projectile.csv and plot it.

    Reads the 'time' and 'height' columns, fits height against
    [time^2, time, 1] with ordinary least squares, prints the regression
    summary, and shows a Bokeh figure with the observations (blue circles)
    and the fitted curve (red line) evaluated at six times from 2.75 to 4.
    """
    data = pd.read_csv("projectile.csv", header=0)
    time = data['time'].values
    height = data['height'].values

    # Columns are ordered [t^2, t, 1] -- the same order np.vander(., 3)
    # produces -- so the fitted parameters line up with the design matrix
    # used for prediction below.
    design = np.c_[time ** 2, time, np.ones(len(time))]
    fit = sm.OLS(height, design).fit()

    print(fit.summary())

    # Time points at which the fitted curve is drawn.
    eval_times = np.array([2.75, 3, 3.25, 3.5, 3.75, 4])

    plot = figure(title='Blue = True; Red = OLS')
    plot.circle(time, height, size=8, color='blue')
    plot.line(eval_times, fit.predict(np.vander(eval_times, 3)), color='red')
    show(plot)

# Answer: 3.4 secs (presumably the time at which the fitted curve comes back to zero height -- TODO confirm)

In [4]:
def healthcare(target_cost = 2800):
    """Print the year at which the fitted cost trend reaches ``target_cost``.

    Reads healthcare.csv, regresses the 'year' column on the 'cost' column
    with ordinary least squares (slope plus intercept), and prints the year
    the fitted line gives for ``target_cost``.

    Parameters
    ----------
    target_cost : number, optional
        Cost value at which the fitted line is evaluated. Defaults to 2800,
        the value the original analysis used, so ``healthcare()`` behaves
        exactly as before.

    Returns
    -------
    None
        The predicted year is printed, not returned.
    """
    # read the csv file
    df = pd.read_csv("healthcare.csv", header = 0)

    cost = df['cost'].values
    year = df['year'].values

    # Design matrix [cost, 1]: res.params is therefore (slope, intercept),
    # in that order.
    X = np.c_[cost, np.ones(len(cost))]

    res = sm.OLS(year, X).fit()

    slope, intercept = res.params

    # Year predicted by the fitted line for the requested cost.
    pred_year = (slope * target_cost) + intercept

    print(pred_year)



In [5]:
def dataset():
    """Plot OLS, Ridge and Lasso degree-5 polynomial fits of eight points.

    Fits the same hard-coded sample with three models -- statsmodels OLS
    (red), Ridge with alpha 0.2 (green) and Lasso with alpha 1 (cyan) --
    and shows the three fitted curves over 100 points between -1.25 and 10
    together with the sample itself (blue circles).
    """
    xs = np.array([ -1, 0, 1,   2, 3, 5, 7, 9 ])
    ys = np.array([ -1, 3, 2.5, 5, 4, 2, 5, 4 ])

    # OLS design: powers 5 down to 1 followed by a column of ones.
    ols_design = np.c_[xs ** 5, xs ** 4, xs ** 3, xs ** 2, xs, np.ones(len(xs))]
    ols_fit = sm.OLS(ys, ols_design).fit()

    # Ridge and Lasso are trained on the Vandermonde matrix of the same degree.
    train_design = np.vander(xs, 6)

    ridge_model = Ridge(alpha=.2)
    ridge_model.fit(train_design, ys)

    lasso_model = Lasso(alpha=1)
    lasso_model.fit(train_design, ys)

    plot = figure(title="Regularization")
    plot.circle(xs, ys, size=8, color='blue')

    grid = np.linspace(-1.25, 10, 100)
    grid_design = np.vander(grid, 6)

    # Draw order matches the original: OLS, then Ridge, then Lasso.
    curves = (
        (ols_fit, 'red'),
        (ridge_model, 'green'),
        (lasso_model, 'cyan'),
    )
    for fitted, colour in curves:
        plot.line(grid, fitted.predict(grid_design), color=colour)

    # Author's observation: Ridge predict is better

    show(plot)


In [6]:
def wine(mflag, show_head = False, show_scatter = False):
    """Cross-validate a classifier on the red-wine quality data.

    Reads redWineQuality.csv (';'-separated), min-max scales every column
    except 'quality' to the range [0, 10], and prints the mean 3-fold
    cross-validation score of the selected classifier.

    Parameters
    ----------
    mflag : bool
        Truthy selects ``LogisticRegression(C = 1)``; falsy selects
        ``KNeighborsClassifier(3)``.
    show_head : bool, optional
        When true, print the first rows of the scaled frame and the number
        of columns. Defaults to False (the original hard-coded setting).
    show_scatter : bool, optional
        When true, draw a scatter matrix of the scaled frame. Defaults to
        False (the original hard-coded setting).

    Returns
    -------
    tuple
        ``(df_n, features, target, model)``: the scaled frame, the frame
        without the 'quality' column, the 'quality' column, and the
        estimator instance that was passed to ``cross_val_score``.
    """
    # read the csv file
    df = pd.read_csv("redWineQuality.csv", header = 0, sep = ";")

    # Scale every feature column to [0, 10]; the target is copied unchanged.
    df_n = pd.DataFrame()
    for col in df.columns:
        if col == "quality":
            df_n[col] = df[col]
            continue

        col_min = df[col].min()
        rng = df[col].max() - col_min
        if rng == 0:
            # A constant column has no spread to scale: dividing by the zero
            # range would fill it with NaN (0/0), which the classifiers
            # reject. Map it to 0.0 instead.
            df_n[col] = pd.Series(0.0, index = df.index)
        else:
            df_n[col] = ((df[col] - col_min) / rng) * 10

    if show_head:
        print(df_n.head())
        print(len(df.columns))

    # Scatter plot
    if show_scatter:
        scatter_matrix(df_n, figsize = (18, 18))
        plt.show()

    # Notes from the scatter plot: some correlation between columns (not
    # sure how to read the quality). 11 factors and 1 target (quality);
    # the factors are scaled above, the target is not.

    features = df_n.drop("quality", axis = 1)
    target = df_n.quality

    if mflag:
        model = LogisticRegression(C = 1)
    else:
        model = KNeighborsClassifier(3)

    print(cross_val_score(model, features, target, cv = 3).mean())
    return df_n, features, target, model

In [7]:
def cwine():
    """Compare KNN and logistic regression on the wine data, then plot.

    Prints "KNN" and runs wine(False), prints "Logistic" and runs
    wine(True), keeping the frame, features, target and model returned by
    the second call. It then shows Ridge (green) and Lasso (cyan) degree-5
    polynomial fits of the target against the "totSul" feature. A bar chart
    of the logistic model's coefficients is also implemented but is
    switched off by a hard-coded flag.
    """
    print("KNN")
    wine(False)

    print("Logistic")
    df, features, target, model = wine(True)

    # Every column except the last one (taken to be the names of the features).
    feature_names = df.columns[:-1]

    # Hard-wired switch: False draws the regularization plot (the original
    # behaviour), True would draw the coefficient bar chart instead.
    plot_coefficients = False

    if plot_coefficients:
        fitted_model = model.fit(features, target)

        chart = figure(title="Model Cofficients")

        coefficients = fitted_model.coef_.ravel()

        # One colour per bar, indexed by bar position.
        palette = ['red', 'orange', 'green', 'purple',
                   'blue', 'cyan', 'magenta', 'red',
                   'orange', 'purple', 'blue']

        for pos in np.arange(len(feature_names)):
            chart.quad(top=coefficients[pos], bottom=0,
                       left=pos + 0.2, right=pos + 0.8,
                       color=palette[pos], legend=feature_names[pos])
        chart.y_range = Range1d(min(coefficients) - 0.1, max(coefficients) + 1.5)
        show(chart)
        return

    fld = "totSul"
    column = features[fld]

    # Both regularized models are trained on the same degree-5 Vandermonde
    # matrix of the selected feature.
    train_design = np.vander(column, 6)

    ridge = Ridge(alpha=.2)
    ridge.fit(train_design, target)

    lasso = Lasso(alpha=1)
    lasso.fit(train_design, target)

    chart = figure(title="Regularization")
    chart.circle(column, target, size=8, color='blue')

    grid = np.linspace(-1.25, 10, 100)
    grid_design = np.vander(grid, 6)

    chart.line(grid, ridge.predict(grid_design), color='green')
    chart.line(grid, lasso.predict(grid_design), color='cyan')

    show(chart)


In [12]:
# Run the projectile fit: prints the OLS summary and shows the plot
projectile()

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.890
Model:                            OLS   Adj. R-squared:                  0.781
Method:                 Least Squares   F-statistic:                     8.120
Date:                Mon, 24 Aug 2015   Prob (F-statistic):              0.110
Time:                        12:23:03   Log-Likelihood:                -9.7586
No. Observations:                   5   AIC:                             25.52
Df Residuals:                       2   BIC:                             24.35
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1           -10.5171      2.880     -3.652      0.0

In [13]:
# Print the year the fitted line gives for a health-care cost of 2800
healthcare()

2005.53244068


In [14]:
# Show the OLS / Ridge / Lasso comparison plot
dataset()



In [15]:
# Print the KNN and logistic cross-validation scores, then show the regularization plot
cwine()

KNN
0.48161649247
Logistic
0.571062189138
