In [None]:
import math
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn import metrics

import statsmodels.api as sm

# Simple linear regression of Weight on Height, reported through both
# scikit-learn and statsmodels, followed by a plot of the fitted line.

# Read the table; the first CSV column is used as the row index.
df = pd.read_csv('D:/iPrimed/Persistence/Data/weight_height.csv', index_col=0)

print(df)
print(df.corr(method='pearson'))  # Pearson correlation matrix of the columns

# scikit-learn expects a 2-D feature matrix (one column per independent
# variable). With a single predictor the 1-D array is reshaped to (n, 1);
# the -1 lets NumPy infer the number of rows.
x = df['Height'].values.reshape(-1, 1)
y = df['Weight'].values

# Least-squares fit with an intercept, trained on 100% of the rows
# (there is no train/test split in this cell).
lr = LinearRegression(fit_intercept=True)
y_pred = lr.fit(x, y).predict(x)

print('Coefficients = ', lr.coef_)
print('Intercept = ', lr.intercept_)
print('R^2 = ', lr.score(x, y))  # closer to 1.0 is better
rmse = math.sqrt(metrics.mean_squared_error(y, y_pred))
print('Root MSE = ', rmse)  # closer to 0.0 is better

# Same regression through statsmodels, which needs the constant column added
# explicitly and produces a full summary report.
x2 = sm.add_constant(x)
est = sm.OLS(y, x2).fit()
print(est.summary())

# Observations (black dots) with the fitted regression line (blue).
fig, ax = plt.subplots()
ax.scatter(x, y, color='black')
ax.plot(x, y_pred, color='blue', linewidth=3)
ax.set_title('Linear Regression Line')
ax.set_xlabel('Height')
ax.set_ylabel('Weight')
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression



# Residuals-vs-predicted plot for the Weight ~ Height regression.
df = pd.read_csv('D:/iPrimed/Persistence/Data/weight_height.csv', index_col=0)

# Single predictor reshaped into the (n, 1) matrix scikit-learn expects.
x = df['Height'].values.reshape(-1, 1)
y = df['Weight'].values

lr = LinearRegression(fit_intercept=True)
y_pred = lr.fit(x, y).predict(x)

# Residual = observed value minus fitted value.
residuals = y - y_pred

for series in (y, y_pred, residuals):
    print(series)

fig, ax = plt.subplots()
ax.set_title('Residuals Analysis')
ax.scatter(y_pred, residuals, color='black')
ax.set_xlabel('Predicted Value')
ax.set_ylabel('Residual')
# Horizontal reference line at residual == 0, drawn between x = 52 and x = 73.
ax.plot([52, 73], [0, 0], color='blue', linestyle='-', linewidth=1)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import scipy.stats as stats



# Normal probability (Q-Q) plot of the residuals of the Weight ~ Height fit.
df = pd.read_csv('D:/iPrimed/Persistence/Data/weight_height.csv', index_col=0)

# Single predictor reshaped into the (n, 1) matrix scikit-learn expects.
x = df['Height'].values.reshape(-1, 1)
y = df['Weight'].values

lr = LinearRegression(fit_intercept=True)
y_pred = lr.fit(x, y).predict(x)

# Residual = observed value minus fitted value.
residuals = y - y_pred

for series in (y, y_pred, residuals):
    print(series)

# Compare the residual quantiles with those of a normal distribution.
stats.probplot(residuals, dist="norm", plot=plt)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import scipy.stats as stats



# Distribution check for each variable: a histogram followed by a normal
# probability plot. Figures 0 and 1 are for Height, 2 and 3 for Weight.
df = pd.read_csv('D:/iPrimed/Persistence/Data/weight_height.csv', index_col=0)

for position, column in enumerate(['Height', 'Weight']):
    plt.figure(2 * position)
    df[column].hist()
    plt.figure(2 * position + 1)
    stats.probplot(df[column], dist="norm", plot=plt)

plt.show()

In [None]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats

import statsmodels.api as sm

# Multiple linear regression of GraduationPercent on the remaining columns of
# the colleges dataset, with residual diagnostics.

# First CSV column is the row index; 'Type' is removed before modelling.
df = pd.read_csv('D:/iPrimed/Persistence/Data/colleges.csv', index_col=0)
df = df.drop('Type', axis=1)
print(df)
print(df.corr(method='pearson'))
print()

# Every column except the target is used as a predictor.
independent_variables = df.drop('GraduationPercent', axis=1)

x = independent_variables.values
y = df['GraduationPercent'].values

# Fitted and scored on the full dataset (no hold-out set in this cell).
lr = LinearRegression(fit_intercept=True)
y_pred = lr.fit(x, y).predict(x)

print('Coefficients = ', lr.coef_)
print('Intercept = ', lr.intercept_)
print('R^2 = ', lr.score(x, y))
rmse = math.sqrt(metrics.mean_squared_error(y, y_pred))
print('Root MSE = ', rmse)

# statsmodels needs the constant column added explicitly.
x2 = sm.add_constant(x)
est = sm.OLS(y, x2).fit()
print(est.summary())

# Residual = observed value minus fitted value.
residuals = y - y_pred

# Figure 0: residuals against predicted values.
plt.figure(0)
plt.title('Residuals Analysis')
plt.scatter(y_pred, residuals, color='black')
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
# Horizontal reference line at residual == 0, drawn between x = 69 and x = 95.
plt.plot([69, 95], [0, 0], color='blue', linestyle='-', linewidth=1)

# Figure 1: normal probability plot of the residuals.
plt.figure(1)
stats.probplot(residuals, dist="norm", plot=plt)

plt.show()


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import feature_selection


import statsmodels.api as sm

# Feature selection on the colleges dataset: univariate F-tests followed by
# recursive feature elimination (RFE) down to two predictors.

# First CSV column is the row index; 'Type' is removed before modelling.
df = pd.read_csv('D:/iPrimed/Persistence/Data/colleges.csv', index_col=0)
df = df.drop('Type', axis=1)

# Every column except the target is a candidate predictor.
independent_variables = df.drop('GraduationPercent', axis=1)

x = independent_variables.values
y = df['GraduationPercent'].values

# Returns F-scores of features and p-values of F-scores.
print(feature_selection.f_regression(x, y, center=True))

print()

estimator = LinearRegression(fit_intercept=True)
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
selector = feature_selection.RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)
#  Selected (i.e., estimated best) features are assigned rank 1.
print(selector.ranking_)


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import feature_selection


import statsmodels.api as sm

# Feature selection on the newsprint dataset: univariate F-tests followed by
# recursive feature elimination (RFE) down to two predictors.
df = pd.read_csv('D:/iPrimed/Persistence/Data/newsprint.csv')

# Every column except the target 'Y' is a candidate predictor.
independent_variables = df.drop('Y', axis=1)

x = independent_variables.values
y = df['Y'].values

# Returns F-scores of features and p-values of F-scores.
print(feature_selection.f_regression(x, y, center=True))

print()

estimator = LinearRegression(fit_intercept=True)
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
selector = feature_selection.RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)
# Selected (i.e., estimated best) features are assigned rank 1.
print(selector.ranking_)


In [None]:
import math
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import statsmodels.api as sm

# Multiple linear regression on the colleges dataset with the categorical
# 'Type' column encoded as a 0/1 indicator named 'University'.
df = pd.read_csv('D:/iPrimed/Persistence/Data/colleges.csv')

# 1 where Type == 'University', 0 otherwise. The vectorised comparison replaces
# a row-by-row loop over DataFrame.ix, which was removed in pandas 1.0.
df['University'] = (df['Type'] == 'University').astype(int)

# 'School' and the now-encoded 'Type' are removed before modelling.
df = df.drop('School', axis=1)
df = df.drop('Type', axis=1)

print(df)

# Every column except the target is used as a predictor.
independent_variables = df.drop('GraduationPercent', axis=1)

x = independent_variables.values
y = df['GraduationPercent'].values

# Fitted and scored on the full dataset (no hold-out set in this cell).
lr = LinearRegression(fit_intercept=True)
lr.fit(x, y)
y_pred = lr.predict(x)

print('Coefficients = ', lr.coef_)
print('Intercept = ', lr.intercept_)
print('R^2 = ', lr.score(x, y))
print('Root MSE = ', math.sqrt(metrics.mean_squared_error(y, y_pred)))

# Same model through statsmodels, which needs the constant column added
# explicitly and prints a full summary report.
x2 = sm.add_constant(x)
ols = sm.OLS(y, x2)
est = ols.fit()
print(est.summary())


Bike Sharing Dataset

This question is based on the Bike Sharing dataset taken from the UCI Machine Learning
Repository (originally from http://capitalbikeshare.com/system-data) –
http://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset.
 
The original source of the dataset is attributed to: 
Fanaee-T, H., Gama, J., “Event Labeling Combining Ensemble Detectors and
Background Knowledge”, Progress in Artificial Intelligence, 2013, pp. 1-15, Springer
Berlin Heidelberg. 

The dataset is concerned with the domain of bike sharing systems. Bike sharing systems are
a new generation of traditional bike rentals in which the whole process, from membership to
rental and return, has become automatic. Through these systems, a user is able to easily rent
a bike from one location and return it at another. Currently, there are over
500 bike-sharing programs around the world, which together comprise over 500 thousand
bicycles. Today, there is great interest in these systems due to their important role in
traffic, environmental and health issues. 

Apart from the interesting real-world applications of bike sharing systems, the characteristics
of the data generated by these systems make them attractive for research. As opposed to other
transport services such as bus or subway, the duration of travel and the departure and arrival
positions are explicitly recorded in these systems. This feature turns a bike sharing system
into a virtual sensor network that can be used for sensing mobility in the city. Hence, it is
expected that most of the important events in the city could be detected by monitoring these data.

The dataset comes in two versions. In the first version, the bike rental records are organized
by day. In the second version, the bike rental records are organized by hour of day. In
general, you may think of one record in the first version for a particular day as being divided
into 24 records in the second version, i.e., one for each hour of the day. However, if not a
single bike was rented out during a particular hour, that hour is excluded from the dataset. In
other words, the first version of the dataset contains 731 observations but the second version
of the dataset contains fewer than 731 x 24 = 17,544 observations. In fact, the second version of
the dataset has only 17,379 observations. It is deemed that there is no missing data. This 
assignment is based on the second version of the dataset. 

In [None]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn import feature_selection
from sklearn.preprocessing import normalize

import statsmodels.api as sm

# Multiple linear regression of the hourly rental count ('cnt') on the
# remaining columns of the bike sharing dataset, with feature ranking and
# residual diagnostics.
df = pd.read_csv('D:/iPrimed/Persistence/Data/bike-sharing-hour.csv')

# The record counter and the raw date string are not used as predictors.
df = df.drop(['instant', 'dteday'], axis=1)

# One-hot encode the two categorical codes. Both get a prefix so that the new
# column names are strings (un-prefixed dummies would be named 1, 2, 3, ...),
# and dtype=int keeps the indicator columns numeric rather than boolean.
season = pd.get_dummies(df['season'], prefix='season', dtype=int)
weathersit = pd.get_dummies(df['weathersit'], prefix='weather', dtype=int)

df = df.drop(['season', 'weathersit'], axis=1)
df = pd.concat([df, season, weathersit], axis=1)

print(df)
print(df.corr(method='pearson'))
print()

# NOTE(review): every column except 'cnt' is used as a predictor. The UCI
# hourly file normally also contains 'casual' and 'registered', whose sum is
# 'cnt'; if they are present here they leak the target -- confirm and drop.
independent_variables = df.drop('cnt', axis=1)

x = independent_variables.values
# NOTE(review): sklearn.preprocessing.normalize rescales each ROW (sample) to
# unit L2 norm by default; it does not standardise columns. Confirm this is
# intended rather than per-feature scaling (e.g. StandardScaler).
x = normalize(x)
y = df['cnt'].values
# Returns F-scores of features and p-values of F-scores.
print(feature_selection.f_regression(x, y, center=True))

estimator = LinearRegression(fit_intercept=True)
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
selector = feature_selection.RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)
#  Selected (i.e., estimated best) features are assigned rank 1.
print(selector.ranking_)

# Same model through statsmodels, which needs the constant column added
# explicitly and prints a full summary report.
x2 = sm.add_constant(x)
ols = sm.OLS(y, x2)
est = ols.fit()
print(est.summary())


lr = LinearRegression(fit_intercept=True)
lr.fit(x, y)
y_pred = lr.predict(x)

# Residual = observed value minus fitted value.
residuals = y - y_pred

# Figure 0: residuals against predicted values.
plt.figure(0)
plt.title('Residuals Analysis')
plt.scatter(y_pred, residuals, color='black')
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
# Zero-residual reference line spanning the actual range of predictions
# (previously hard-coded to the interval [0, 50]).
plt.plot([y_pred.min(), y_pred.max()], [0, 0], color='blue', linestyle='-', linewidth=1)

# Figure 1: normal probability plot of the residuals.
plt.figure(1)
stats.probplot(residuals, dist="norm", plot=plt)

plt.show()



Boston Housing Dataset

Perform a multiple linear regression analysis on the Boston housing dataset and report your
results and findings.

In [None]:
import math
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import scipy.stats as stats
from sklearn import feature_selection

import statsmodels.api as sm

# Multiple linear regression of MEDV on the other columns of the Boston
# housing dataset, with feature ranking and residual diagnostics.
df = pd.read_csv('D:/iPrimed/Persistence/Data/BostonHousing.csv')

print(df)
print(df.corr(method='pearson'))
print()

# Every column except the target is used as a predictor.
independent_variables = df.drop('MEDV', axis=1)

# NOTE(review): these columns are removed from `df` only AFTER
# `independent_variables` was captured above, so `x` below still contains all
# of them and the drop has no effect on the fitted model. If the intent was to
# fit on a reduced feature set, build `independent_variables` after this drop.
df = df.drop(['CRIM', 'ZN', 'INDUS', 'AGE', 'RAD', 'TAX', 'LSTAT', 'B'], axis=1)

x = independent_variables.values
y = df['MEDV'].values
# Returns F-scores of features and p-values of F-scores.
print(feature_selection.f_regression(x, y, center=True))

estimator = LinearRegression(fit_intercept=True)
# n_features_to_select must be passed by keyword: scikit-learn >= 1.0 rejects
# it as a positional argument.
selector = feature_selection.RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x, y)
#  Selected (i.e., estimated best) features are assigned rank 1.
print(selector.ranking_)

# Kept for the (currently disabled) selection snippet at the end of the cell.
a = selector.ranking_

# Same model through statsmodels, which needs the constant column added
# explicitly and prints a full summary report.
x2 = sm.add_constant(x)
ols = sm.OLS(y, x2)
est = ols.fit()
print(est.summary())

lr = LinearRegression(fit_intercept=True)
lr.fit(x, y)
y_pred = lr.predict(x)

# Residual = observed value minus fitted value.
residuals = y - y_pred

# Figure 0: residuals against predicted values.
plt.figure(0)
plt.title('Residuals Analysis')
plt.scatter(y_pred, residuals, color='black')
plt.xlabel('Predicted Value')
plt.ylabel('Residual')
# Horizontal reference line at residual == 0, drawn between x = 0 and x = 50.
plt.plot([0, 50], [0, 0], color='blue', linestyle='-', linewidth=1)

# Figure 1: normal probability plot of the residuals.
plt.figure(1)
stats.probplot(residuals, dist="norm", plot=plt)

plt.show()

# The two string literals below are not executed as code: the first is a note,
# the second is a disabled draft for keeping only features with RFE rank < 4.
'''
Higher the F score , better, that is more likely it is the independant variable
'''
'''
print(independent_variables.columns.values)
independent_variables2 = []

for i in range(len(independent_variables.columns.values)):
    if a[i] < 4:
        independent_variables2[independent_variables.columns.values[i]] = independent_variables[independent_variables.columns.values[i]]
print('Independant varaibles 2')
print(independent_variables2)
'''

Logistic Regression 

In [None]:
#Importing packages

import numpy as np   #Importing Numpy

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import sklearn as sklearn

# Logistic regression on the Social Network Ads dataset: predict the column at
# position 4 from the columns at positions 2 and 3.

# Importing the dataset
logistic_dataset = pd.read_csv('D:/iPrimed/Persistence/Data/Social_Network_Ads.csv')

logistic_X = logistic_dataset.iloc[:, [2, 3]].values #Causes

logistic_y = logistic_dataset.iloc[:, 4].values #Effect

#Descriptive statistics
# Bare expressions in the middle of a cell display nothing; these type()
# lookups are kept only as a check that the expected columns exist.
type(logistic_dataset)
type(logistic_dataset.Gender)
type(logistic_dataset.Age)
type(logistic_dataset.EstimatedSalary)
type(logistic_dataset.Purchased)

print(logistic_dataset.dtypes)
print(logistic_dataset.Age.mean())
print(logistic_dataset.describe())

#Exploratory Analysis
sns.pairplot(logistic_dataset)
# Each scatter gets its own figure; otherwise it would be drawn onto the last
# axes of the pairplot grid.
plt.figure()
plt.scatter(logistic_dataset.Age,logistic_dataset.EstimatedSalary)
plt.figure()
plt.scatter(logistic_dataset.EstimatedSalary,logistic_dataset.Purchased)


# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

logistic_X_train, logistic_X_test, logistic_y_train, logistic_y_test = train_test_split(logistic_X, logistic_y, test_size = 0.2, random_state = 0)

# Feature Scaling
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(logistic_X_train)
X_test = sc.transform(logistic_X_test)

# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression

logistic_classifier = LogisticRegression(random_state = 0)

# Fit on the SCALED features. Previously the unscaled arrays were used here and
# in predict(), leaving X_train / X_test unused.
logistic_classifier.fit(X_train, logistic_y_train)

print(logistic_classifier)

print(logistic_classifier.coef_)

# Predicting the Test set results (scaled with the training-set scaler)
logistic_y_pred = logistic_classifier.predict(X_test)

# Side-by-side view of actual (first column) and predicted (second column) labels.
Y_Test=pd.DataFrame(logistic_y_test)
Y_Pred=pd.DataFrame(logistic_y_pred)
comparison=[Y_Test,Y_Pred]
print(pd.concat(comparison,axis=1).head())

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
logistic_cm = confusion_matrix(logistic_y_test, 
                               logistic_y_pred)
print(logistic_cm)

# Rows are predicted labels, columns are actual labels.
cross_tab=pd.crosstab(logistic_y_pred,logistic_y_test,
                      rownames=['Predicted'],colnames=['Actual'])
print("Cross Tabulation")
print(cross_tab)

Decision Tree Classification

In [None]:
#Importing Libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Decision-tree classification of 'Purchased' from Gender and Age on the
# Social Network Ads dataset.

#Reading data
data = pd.read_csv(r"D:/iPrimed/Persistence/Data/Social_Network_Ads.csv")

#Removing User ID, as it's an incremental value that doesn't add to our classification prediction
data.pop('User ID')

#Replacing categorical values to numericals
# Assign the encoded column back instead of calling replace(..., inplace=True)
# on data['Gender']: the chained in-place form is deprecated in pandas and does
# not update `data` under copy-on-write. Any value other than 'Male'/'Female'
# becomes NaN.
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})

#Using features: Gender, Age for prediction of Purchased label
feature_cols = ['Gender', 'Age']
X = data[feature_cols] # Features
y = data.Purchased # Target variable

#Divide the data into train and test split. 
#The following code will split the data-set into 70% training data and 30% of testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

#Train the model with the help of DecisionTreeClassifier
# random_state is fixed so the fitted tree is reproducible across runs.
clf = DecisionTreeClassifier(random_state = 0)
clf = clf.fit(X_train,y_train)

#At last we need to make predictions on the held-out test split
y_pred = clf.predict(X_test)

#Next, we can get the accuracy score, confusion matrix and classification report as follows
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, y_pred)

# Rows are predicted labels, columns are actual labels.
cross_tab=pd.crosstab(y_pred,y_test)
print("Cross Tabulation")
print(cross_tab)

result1 = classification_report(y_test, y_pred)
print("Classification Report:",)
print (result1)

result2 = accuracy_score(y_test,y_pred)
print("Accuracy:",result2)

#Visualizing Decision Tree
#The above decision tree can be visualized with the help of following code −
#from sklearn.tree import export_graphviz
#from sklearn.externals.six import StringIO
#from IPython.display import Image
#import pydotplus
#dot_data = StringIO()
#export_graphviz(clf, out_file=dot_data, filled=True, rounded=True,
#   special_characters=True,feature_names = feature_cols,class_names=['0','1'])
#graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write_png('Purchased.png')
#Image(graph.create_png())

Unsupervised Learning - Clustering

In [None]:
#Market Segmentation
 
#Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sklearn
sns.set()
from sklearn.cluster import KMeans
from sklearn import preprocessing
 
# Market segmentation: explore the satisfaction/loyalty survey data, cluster
# it with k-means (k = 5) and export the rows with their cluster label.
data = pd.read_csv('D:/iPrimed/Persistence/Data/satisfaction.csv')

# Numeric columns only. mean()/std()/corr() on a frame that still contains the
# string 'Gender' column raise in pandas >= 2.0.
data_numeric = data.select_dtypes(include='number')

# Exploratory look at the data. Bare expressions in the middle of a cell
# display nothing; run any of them alone in a cell to see the result.
data.head()
data.tail()
data.dtypes
type(data)
data.columns
data.axes
data.size
data.ndim
data.shape
data.describe()
data_numeric.mean()
data_numeric.std()
data.groupby('Gender').mean()
data_men = data[data['Gender']=='Men']
data_women = data[data['Gender']=='Women']
data_men.head()
data_women.head()
data_men.describe()
data_women.describe()
data_numeric.corr()
data_men.select_dtypes(include='number').corr()
data_women.select_dtypes(include='number').corr()

# One figure per scatter plot; without plt.figure() all three would be drawn
# on the same axes and only the last pair of axis labels would survive.
plt.figure()
plt.scatter(data['Satisfaction'],data['Loyalty'])
plt.xlabel('Satisfaction')
plt.ylabel('Loyalty')

plt.figure()
plt.scatter(data_men['Satisfaction'],data_men['Loyalty'])
plt.xlabel('Satisfaction of Men')
plt.ylabel('Loyalty of Men')

plt.figure()
plt.scatter(data_women['Satisfaction'],data_women['Loyalty'])
plt.xlabel('Satisfaction of Women')
plt.ylabel('Loyalty of Women')

# Cluster on the first two columns, which are referenced below by the names
# 'Satisfaction' and 'Loyalty'.
x = data.iloc[:,:2]
# random_state makes the cluster labels reproducible; n_init=10 pins the
# number of restarts explicitly. fit_predict() fits the model itself, so no
# separate fit() call is needed.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
clusters = x.copy()
clusters['predict']=kmeans.fit_predict(x)

# Points coloured by their assigned cluster.
plt.figure()
plt.scatter(clusters['Satisfaction'], clusters['Loyalty'], c=clusters['predict'], cmap='rainbow')

plt.xlabel('Satisfaction')
plt.ylabel('Loyalty')

# Original rows plus the cluster label, written to the working directory.
report = pd.concat([data,clusters['predict']],axis=1)

report.to_csv('report.csv')
