In [None]:
# Python 2 & 3 Compatibility
from __future__ import print_function, division

import pandas as pd    #import pandas to read in out write out data
import statsmodels.api as sm    #import statsmodels for the stats models
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats    #import for stats functions

In [None]:
# Load the practice data set and build the regression inputs:
#   y - the dependent variable column
#   X - every other column, plus polynomial terms in Ind_Variable_4 and a constant column
Data = pd.read_csv('Practice_data.csv')    #read in data
y = Data['Dep_Variable']    #extract dependent variable
# `axis` is passed by keyword: the positional form `drop(label, 1)` was deprecated in
# pandas 1.3 and raises a TypeError in pandas 2.x.
X = Data.drop('Dep_Variable', axis=1)    #remove it from the data and keep everything else
X['Ind_Variable_4**2'] = X['Ind_Variable_4']**2.0    #include a squared term on x_4
X['Ind_Variable_4**3'] = X['Ind_Variable_4']**3.0    #include a cubed term on x_4
X['Intercept'] = 1.0    #include the intercept (the scalar is broadcast to every row)

In [None]:
#plot the relationships
#The density of y
KDE_BANDWIDTH = 0.25    #fixed bandwidth factor (original author's note: chosen so plots look analogous to R's)

ys = np.linspace(y.min(), y.max(), 200)    #200 evenly spaced evaluation points spanning the observed range of y

#compute the density of y
# Passing a scalar `bw_method` makes gaussian_kde use it directly as the covariance
# factor, so there is no need to overwrite `covariance_factor` and call the private
# `_compute_covariance()` after construction.
density_y = scipy.stats.gaussian_kde(y, bw_method=KDE_BANDWIDTH)

In [None]:
# Draw the kernel density estimate of y as a dashed outline with a translucent fill,
# and mark the sample mean of y on the plot.
density_vals = density_y(ys)    #evaluate the KDE once and reuse it for the line, the fill and the label height

plt.figure(figsize = (20,8))    #initiate the plot
plt.plot(ys,density_vals,'b--',lw=4)    #generate the plot
plt.fill(ys,density_vals,'b',alpha=0.3)    #fill in the curves
plt.xlabel('y',fontsize=30)    #add an x label
plt.ylabel('density of y',fontsize=30)    #add a y label
# The label sits at x = mean(y), halfway up the density peak. \bar{y} is the usual
# symbol for a sample mean; \widehat{y} would denote a fitted/predicted value.
plt.text(np.mean(y),density_vals.max()/2.0,r'$\bar{y}$',fontsize=30) #LaTeX for mean
#plt.title('A Simple Density',fontsize=40)    #add a title to the plot
plt.grid(True)
#plt.savefig('Density_of_y.pdf')    #save the plot to file
plt.show()    #and then show the plot

In [None]:
#The variables vs y
# Scatter each of the six raw independent variables against y on a 2x3 grid.
# The whole grid is built in a single cell: with the inline backend a figure is
# rendered and closed at the end of the cell that created it, so panels added from a
# later cell would land on a new, default-sized figure instead of this one.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))    #2 rows x 3 columns of panels

#axes.flat walks the grid row by row, so panel k shows Ind_Variable_k
for k, ax in enumerate(axes.flat, start=1):
    ax.plot(X['Ind_Variable_%d' % k], y, 'bs')    #blue square markers, no connecting line
    ax.set_xlabel(r'$x_%d$' % k, fontsize=10)    #add an x label, r' ' is for raw string output
    ax.set_ylabel(r'$y$', fontsize=10)    #add a y label

#fig.savefig('Variable_relationships.pdf')    #save the plot to file
plt.show()    #and then show the plot

In [None]:
#split into training and test sets:
SPLIT_SEED = 42    #fixed seed so the split (and every number derived from it) is reproducible
TRAIN_FRACTION = 0.75    #expected share of rows assigned to the training set

# A dedicated RandomState keeps the split reproducible without touching NumPy's global
# random state. Each row is assigned independently, so the realised training share is
# only approximately TRAIN_FRACTION.
split_rng = np.random.RandomState(SPLIT_SEED)
mask = split_rng.rand(X.shape[0]) < TRAIN_FRACTION    #True -> training row, False -> test row
X_train = X[mask]    #take the training independent variable data
X_test = X[~mask]    #and the test independent variable data
y_train = y[mask]    #take the training dependent variable data
y_test = y[~mask]    #and the test dependent variable data

#train the model
model = sm.OLS(y_train, X_train)    #define the statsmodels model object (X already carries the 'Intercept' column)
results = model.fit()    #fit the model
results.summary()    #summarize

In [None]:
#consider in the test set
# Adjusted R^2 on the held-out rows:
#     1 - (SS_res / (n - p)) / (SS_tot / (n - 1))
# where n is the number of test rows and p the number of fitted coefficients.
test_pred = results.predict(X_test)    #model predictions for the test rows
n_test = len(X_test)
# The design matrix contains an explicit 'Intercept' column, so len(results.params)
# already counts the constant; subtracting a further 1 would count it twice.
n_params = len(results.params)

ss_res = ((y_test - test_pred)**2.0).sum()    #residual sum of squares
ss_tot = ((y_test - y_test.mean())**2.0).sum()    #total sum of squares about the test mean

r2_adj = 1 - (ss_res/(n_test - n_params))/(ss_tot/(n_test - 1))    #compute the adjusted r2 in the test set
r2_adj