Basic Libraries

In [None]:
# Numerical and tabular data libraries
import numpy as np
import pandas as pd
# Plotting libraries: seaborn for statistical plots, matplotlib's pyplot for figures/axes
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

Importing a dataframe from various sources

In [None]:
# Build a dataframe from a dictionary: every key becomes a column name and
# every list holds the values of that column (one entry per row).
canteens_dict = dict(
    Name = ["North Spine", "Koufu", "Canteen 9", "North Hill", "Canteen 11"],
    Stalls = [20, 15, 10, 12, 8],
    Rating = [4.5, 4.2, 4.0, 3.7, 4.2],
)

canteens_df = pd.DataFrame(data = canteens_dict)

#importing a csv file into a dataframe
#header = None tells pandas the file has no header row, so the columns are numbered 0, 1, 2, ...
csv_data = pd.read_csv('data/somedata.csv', header = None)

#importing a txt file into a dataframe
#sep is a regular expression (one or more whitespace characters). it is written as a raw string
#because "\s" inside a normal string is an invalid escape sequence
txt_data = pd.read_table('data/somedata.txt', sep = r"\s+", header = None)

#importing a xlsx (excel) file into a dataframe
xls_data = pd.read_excel('data/somedata.xlsx', sheet_name = 'Sheet1', header = None)

#importing a json file into a dataframe
json_data = pd.read_json('data/somedata.json')

#importing the tables of a html website
#NOTE: read_html returns a LIST of dataframes (one per table found on the page), not a single dataframe,
#so the table you want still has to be picked out by index, eg html_data[0]
html_data = pd.read_html('http://www.imdb.com/title/tt0441773/fullcredits/?ref_=tt_ov_st_sm')

Extraction from a data frame

In [None]:
#example values for the placeholders used below (change them to whatever you need)
#without these the cell raises a NameError on a fresh kernel
n = 3        #number of rows to show with head()
a, b = 1, 4  #start (included) and end (excluded) row positions for slicing

#NOTE: a notebook cell only displays its LAST expression. to see an earlier result, run the line in
#its own cell or wrap it in display(...)

#extracting a single column from a data frame by using name of column
canteens_df["Name"]

#extracting a single row from a data frame (iloc selects by position, so 0 is the first row)
canteens_df.iloc[0]

#extracting the top n rows from a data frame (default n = 5)
canteens_df.head()
canteens_df.head(n = n)

#extracting a portion of rows from a dataframe
canteens_df[a:b] #extracts starting from row a and ends with b-1

#extracting a single column from a dataframe and creating a dataframe with that variable (only shows HP column)
hp = pd.DataFrame(pkmndata['HP'])

#extracting only the values of a column that satisfy a condition (only shows HP if HP == 1)
#pkmndata["HP"] == 1 on its own is just a True/False mask, so it is used here to SELECT the matching rows.
#the result gets its own name so that hp (the full HP column, which the plotting and regression cells use)
#is not overwritten
hp_equal_1 = pd.DataFrame(pkmndata.loc[pkmndata["HP"] == 1, "HP"])

#extracting the rows that satisfy a condition with all other columns included
#assumes dualtype_data has already been extracted from the main dataframe - it is not created in this cell
dualtype_gen1 = dualtype_data[dualtype_data["Generation"] == 1]

Basic functions of a dataset

In [None]:
#checking the type of dataset
type(canteens_df)

#finding the shape of a data frame. this returns a tuple where the first number is NoOfRows and second number is NoOfColumns
canteens_df.shape

#finding the dtypes of each individual column
canteens_df.dtypes

#info gives more information about the dataset than dtypes
canteens_df.info()

#describe provides summary statistics about the dataset. By default describe only summarises the numerical variables,
#which is why describe shows fewer variables than info. some variables look numerical but are actually categorical
#variables encoded as numbers
canteens_df.describe()

#changing the type of data (here: marking a number-encoded column as categorical)
#assumes houseCatData is a dataframe loaded elsewhere - it is not created in this cell
houseCatData['MSSubClass'] = houseCatData['MSSubClass'].astype('category')

Step-by-step guide for importing from an HTML website

In [None]:
#when we import from a html website, we may get many dataframes (read_html returns a list of them).
#it is important to check which dataframe we are actually using
#eg
medal_html = pd.read_html('https://en.wikipedia.org/wiki/2016_Summer_Olympics_medal_table') #import the html website first
medal_html[0] #inspect the dataframes one at a time by changing the index: 0, 1, 2 etc
medalTable = medal_html[1] #we realise we are supposed to use the second dataframe (index 1), so we extract it out

Functions for manipulating a dataframe

In [None]:
#setting column names of a dataframe
medalTable.columns = medalTable.iloc[0] #this sets the column names to the values in the first row

#deleting a row in the dataframe (use the index label of the row)
#NOTE: this is not idempotent - once label 0 has been dropped, running this line again fails because
#label 0 no longer exists in the index
medalTable = medalTable.drop(0)

#obtaining unique values in a column (returns an array of the distinct values, not a python list)
medalTable["variable"].unique()
#obtaining the number of occurrences of each unique value in a column
print(medalTable["variable"].value_counts())
#eg a column has the values [1, 2, 3, 3]. the first one returns [1, 2, 3] and the second one returns the counts
#per value, most frequent first: 3 appears 2 times, 1 and 2 appear once each

#showing which rows have a null value
#isnull is a method: it has to be CALLED (with brackets) to produce the True/False mask, and the mask
#has to be built from the same dataframe that is being indexed
name = data[data["Variable"].isnull()]
#showing which rows have a non-null value (notnull() is the opposite mask of isnull())
name = data[data["Variable"].notnull()]

#removing null values: dropna returns a NEW series without the nulls, data itself is left unchanged
data["variable"].dropna()

Uni-Variate Statistics

In [None]:
#hp is presumably the single-column HP dataframe created in the extraction cell - confirm before running
#each plt.subplots call creates a new figure; a seaborn call that is not given ax draws on that current axes

#showing the dataframe using a boxplot (summary statistics)
f, axes = plt.subplots(1, 1, figsize=(24, 4))
sb.boxplot(hp, orient = "h")

#showing the dataframe using a histogram with automatic bin sizes (complete distribution)
#NOTE(review): distplot is deprecated in newer seaborn releases - confirm the installed version still provides it
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.distplot(hp, kde = False, color = "red")

#showing the dataframe using a KDE. a Kernel Density Estimate estimates the pdf of a variable
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.distplot(hp, hist = False, color = "red")

#showing the dataframe with both histogram and a KDE
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.distplot(hp, color = "red")

#showing the dataframe with a violin plot (boxplot combined with the KDE)
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.violinplot(hp)

#showing multiple plots together
f, axes = plt.subplots(2, 3, figsize=(24, 12)) #first number (ie 2) is the number of rows of plots, second number (ie 3) is the number of columns

#axes is indexed as axes[row, column]; only the first row is filled here, the second row of axes stays empty
sb.boxplot(hp, orient = "h", ax = axes[0,0])
sb.distplot(hp, kde = False, ax = axes[0,1])
sb.violinplot(hp, ax = axes[0,2]) #plotting all plots together

#showing a catplot of countplot (number of rows for each value of "Type 2")
sb.catplot(y = "Type 2", data = pkmndata, kind = "count", height = 8)

Bi-Variate Statistics

In [None]:
#extract the two variables to compare, each as a single-column dataframe
#(attack is not created anywhere else, and hp is re-extracted so this cell does not depend on earlier cells)
attack = pd.DataFrame(pkmndata["Attack"])
hp = pd.DataFrame(pkmndata["HP"])

# Create a joint dataframe by concatenating the two variables side by side
# (the join_axes argument was removed from pd.concat in pandas 1.0; reindexing on attack's index
#  keeps exactly the rows of attack, which is what join_axes = [attack.index] used to do)
jointDF = pd.concat([attack, hp], axis = 1).reindex(attack.index)

#jointplot (scatter plot of the two variables together with the distribution of each one)
sb.jointplot(data = jointDF, x = "Attack", y = "HP", height = 8)

# Calculate the correlation between the two columns/variables
jointDF.corr()

#visualising correlation matrix using heatmap
#a new figure is created first, otherwise the heatmap is drawn on top of the axes left active by jointplot
f, axes = plt.subplots(1, 1, figsize=(12, 8))
sb.heatmap(jointDF.corr(), vmin = -1, vmax = 1, annot = True, fmt=".2f", ax = axes)

#showing count using heatmap
#groupby(...).size() counts the rows for every (Type 1, Type 2) pair; unstack() turns that into a table
#with Type 1 as rows and Type 2 as columns
f, axes = plt.subplots(1, 1, figsize=(20, 20))
sb.heatmap(dualtype_data.groupby(['Type 1', 'Type 2']).size().unstack(), 
           linewidths = 1, annot = True, annot_kws = {"size": 18}, cmap = "BuGn") #in this example, dualtype_data has to be
#extracted out from the main dataframe first

#showing a catplot of countplot: one count plot of 'Type 1' per value of 'Generation', wrapped 2 plots per row
sb.catplot(y = 'Type 1', data = pkmndata, col = 'Generation', kind = 'count', col_wrap = 2, height = 8)

#showing a boxplot with 2 variables (distribution of SalePrice for each Neighborhood)
#assumes houseCatSale is a dataframe prepared elsewhere - it is not created in this cell
f, axes = plt.subplots(1, 1, figsize=(16, 8))
sb.boxplot(x = 'Neighborhood', y = 'SalePrice', data = houseCatSale)
plt.xticks(rotation=90); #rotate the x labels so they do not overlap; the trailing ; hides the text output

Multi-Variate Statistics

In [None]:
# Extract only the numeric data variables (the six listed columns of pkmndata) into their own dataframe
numDF = pd.DataFrame(pkmndata[["HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed"]])

# Summary Statistics for all Variables
numDF.describe()

# Draw the Boxplots of all variables (one horizontal box per column of numDF)
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.boxplot(data = numDF, orient = "h")

# Draw the distributions of all variables
# One row of axes per variable (taken from numDF instead of a hard-coded 6): histogram + KDE on the
# left, violin plot on the right. squeeze = False keeps axes two-dimensional even for a single variable.
f, axes = plt.subplots(len(numDF.columns), 2, figsize=(12, 24), squeeze = False)

# Iterating over a dataframe yields its column names; enumerate supplies the matching row number
for count, var in enumerate(numDF):
    sb.distplot(numDF[var], ax = axes[count,0])
    sb.violinplot(numDF[var], ax = axes[count,1])

# Calculate the complete correlation matrix (correlation between every pair of columns in numDF)
numDF.corr()

# Heatmap of the Correlation Matrix
# vmin / vmax fix the colour scale to the full correlation range [-1, 1]; annot writes the value in each cell
f, axes = plt.subplots(1, 1, figsize=(12, 8))
sb.heatmap(numDF.corr(), vmin = -1, vmax = 1, annot = True, fmt = ".2f")

# Draw pairs of variables against one another (pairplot creates its own figure)
sb.pairplot(data = numDF)

Linear Regression

In [None]:
#creating a train set of 600 samples and a test set with 200 samples

# Extract the two variables here so that this cell does not rely on names created elsewhere:
# total is otherwise only defined further down (in the decision tree section), and hp may have
# been overwritten by an earlier example.
hp = pd.DataFrame(pkmndata['HP'])        # Predictor
total = pd.DataFrame(pkmndata['Total'])  # Response

# assumes pkmndata has 800 rows, so that the first 600 and the last 200 rows do not overlap - TODO confirm

# Train Set : 600 samples (the first 600 rows)
hp_train = pd.DataFrame(hp[:600])
total_train = pd.DataFrame(total[:600])

# Test Set : 200 samples (the last 200 rows)
hp_test = pd.DataFrame(hp[-200:])
total_test = pd.DataFrame(total[-200:])

# Check the sample sizes
print("Train Set :", hp_train.shape, total_train.shape)
print("Test Set  :", hp_test.shape, total_test.shape)

Step by step guide to linear regression

In [None]:
# Import LinearRegression model from Scikit-Learn
from sklearn.linear_model import LinearRegression

# Create a Linear Regression object
linreg = LinearRegression()

# Train the Linear Regression model
linreg.fit(hp_train, total_train) #hp_train was used as predictor and total_train was used as response

#You have trained the model to fit the following formula: Regression Problem : Total = a * HP + b
#Check Intercept ( b ) and Coefficient ( a ) of the regression line
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

#Plot the regression line based on the coefficients-intercept form
# Formula for the Regression line: y = b + a * x, evaluated at every HP value of the train set
regline_x = hp_train
regline_y = linreg.intercept_ + linreg.coef_ * hp_train

# Plot the Linear Regression line (red) on top of the train data points
f, axes = plt.subplots(1, 1, figsize=(16, 8))
plt.scatter(hp_train, total_train)
plt.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

#Plot the regression line by prediction using the model
# Predict Total values corresponding to HP Train
total_train_pred = linreg.predict(hp_train)

# Plot the predictions (red points) on top of the train data points
f, axes = plt.subplots(1, 1, figsize=(16, 8))
plt.scatter(hp_train, total_train)
plt.scatter(hp_train, total_train_pred, color = "r")
plt.show()

#Check how good the predictions are on the Train Set. Metrics : Explained Variance and Mean Squared Error.
# Explained Variance (R^2)
print("Explained Variance (R^2) \t:", linreg.score(hp_train, total_train))

# Mean Squared Error (MSE)
def mean_sq_err(actual, predicted):
    '''Compute the Mean Squared Error between actual and predicted values.

    Both inputs are converted to numpy arrays and subtracted element-wise;
    the mean of the squared differences is returned.
    '''
    actual_values = np.asarray(actual)
    predicted_values = np.asarray(predicted)
    squared_errors = np.square(actual_values - predicted_values)
    return np.mean(squared_errors)


# MSE of the model on the Train Set, and its square root (RMSE)
mse = mean_sq_err(total_train, total_train_pred)
print("Mean Squared Error (MSE) \t:", mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mse))

#Test the Linear Regression model linreg using the Test Set.
# Predict Total values corresponding to HP Test
total_test_pred = linreg.predict(hp_test)

# Plot the Predictions: actual test values in green, predicted values in red
f, axes = plt.subplots(1, 1, figsize=(16, 8))
plt.scatter(hp_test, total_test, color = "green")
plt.scatter(hp_test, total_test_pred, color = "red")
plt.show()

#Check how good the predictions are on the Test Set. Metrics : Explained Variance and Mean Squared Error.
# Explained Variance (R^2)
print("Explained Variance (R^2) \t:", linreg.score(hp_test, total_test))

# Mean Squared Error (MSE)
# mean_sq_err is already defined earlier in this cell (for the Train Set check), so it is reused
# here instead of being defined a second time
mse = mean_sq_err(total_test, total_test_pred)
print("Mean Squared Error (MSE) \t:", mse)
print("Root Mean Squared Error (RMSE) \t:", np.sqrt(mse))

Step-by-step guide to building a decision tree

In [None]:
#Extract the variables into a new dataframe (each one as a single-column dataframe)
legnd = pd.DataFrame(pkmndata['Legendary'])  # Response
total = pd.DataFrame(pkmndata['Total'])      # Predictor

#Set up the classification problem with Train and Test datasets
# assumes pkmndata has 800 rows, so that the first 600 and the last 200 rows do not overlap - TODO confirm
# Train Set : 600 samples (the first 600 rows)
total_train = pd.DataFrame(total[:600])
legnd_train = pd.DataFrame(legnd[:600])

# Test Set : 200 samples (the last 200 rows)
total_test = pd.DataFrame(total[-200:])
legnd_test = pd.DataFrame(legnd[-200:])

# Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier

# Create a Decision Tree Classifier object (max_depth = 2 limits the tree to two levels of splits)
dectree = DecisionTreeClassifier(max_depth = 2)

# Train the Decision Tree Classifier model (predictor first, response second)
dectree.fit(total_train, legnd_train)

# Import export_graphviz from sklearn.tree
from sklearn.tree import export_graphviz

# Export the Decision Tree as a dot object
treedot = export_graphviz(dectree,                                      # the model
                          feature_names = total_train.columns,          # the names of the predictor columns
                          out_file = None,                              # no output file: the dot source is returned instead
                          filled = True,                                # node colors
                          rounded = True,                               # rounded node boxes (make pretty)
                          special_characters = True)                    # postscript

# Render using graphviz
# NOTE(review): a bare expression is only displayed when it is the last statement of a notebook cell -
# if more code follows in the same cell, the tree will not be shown
import graphviz
graphviz.Source(treedot)

#Check how good the predictions are on the Train Set. Metrics : Classification Accuracy and Confusion Matrix.
# Predict Legendary corresponding to Total Train
legnd_train_pred = dectree.predict(total_train)

# Print the Classification Accuracy
print("Classification Accuracy \t:", dectree.score(total_train, legnd_train))

# Plot the two-way Confusion Matrix
# Each confusion matrix gets its own figure and title: without a new figure the second heatmap
# would be drawn on top of the first one, on the same axes.
from sklearn.metrics import confusion_matrix
f, axes = plt.subplots(1, 1, figsize=(6, 4))
sb.heatmap(confusion_matrix(legnd_train, legnd_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes)
axes.set_title("Confusion Matrix (Train Set)")

#Check how good the predictions are on the Test Set. Metrics : Classification Accuracy and Confusion Matrix.
# Predict Legendary corresponding to Total Test
legnd_test_pred = dectree.predict(total_test)

# Print the Classification Accuracy
print("Classification Accuracy \t:", dectree.score(total_test, legnd_test))

# Plot the two-way Confusion Matrix (confusion_matrix was already imported above)
f, axes = plt.subplots(1, 1, figsize=(6, 4))
sb.heatmap(confusion_matrix(legnd_test, legnd_test_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes)
axes.set_title("Confusion Matrix (Test Set)")

#Split the Train and Test sets randomly, and perform Classification.
# Import essential models and functions from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
import graphviz

# Recall the Legendary-Total Dataset
legnd = pd.DataFrame(pkmndata['Legendary'])   # Response
total = pd.DataFrame(pkmndata['Total'])       # Predictor

# Split the Legendary-Total Dataset into Train and Test (75% train, 25% test)
# random_state fixes the seed of the shuffle, so re-running the notebook gives the same split
# and therefore the same accuracy / confusion matrix. change the number to get a different split.
X_train, X_test, y_train, y_test = train_test_split(total, legnd, test_size = 0.25, random_state = 42)

# Decision Tree using Train Data
dectree = DecisionTreeClassifier(max_depth = 2)  # create the decision tree object
dectree.fit(X_train, y_train)                    # train the decision tree model

# Predict Legendary values corresponding to Total
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree.score(X_test, y_test))
print()

# Plot the Confusion Matrix for Train and Test side by side (train on the left axes, test on the right)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(confusion_matrix(y_train, y_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[0])
sb.heatmap(confusion_matrix(y_test, y_test_pred), 
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[1])

# Plot the Decision Tree
treedot = export_graphviz(dectree,                                      # the model
                          feature_names = X_train.columns,              # the names of the predictor columns
                          out_file = None,                              # no output file: the dot source is returned instead
                          filled = True,                                # node colors
                          rounded = True,                               # rounded node boxes (make pretty)
                          special_characters = True)                    # postscript

# Last expression of the cell, so the rendered tree is displayed as the cell output
graphviz.Source(treedot)