In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [None]:
# Import essential models and functions from sklearn - for Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# For Classification Tree
# Import Decision Tree Classifier model from Scikit-Learn
from sklearn.tree import DecisionTreeClassifier
# Import export_graphviz from sklearn.tree
from sklearn.tree import export_graphviz
# Import Confusion Matrix
from sklearn.metrics import confusion_matrix
# Render using graphviz
import graphviz

In [None]:
# Read CSV file ('file_name' is a placeholder -- replace it with the real path)
quizdata = pd.read_csv('file_name')
# Extract columns - UniVariate Stats (each variable as its own one-column DataFrame)
column1 = pd.DataFrame(quizdata['Column1'])
column2 = pd.DataFrame(quizdata['Column2'])
...  # placeholder: repeat the pattern above for any further columns
# Extract only the numeric data variables - MultiVariate Stats
numDF = pd.DataFrame(quizdata[["Column1", "Column2", "Column3"]])

# Access rows by integer position.
# The frame loaded in this cell is `quizdata`, so index that frame.
row = 0  # CHANGE THIS: position of the row to inspect
quizdata.iloc[row]
# Overall statistical description of the data and plot standard statistical distributions for each variable
# NOTE: a bare expression is only rendered when it is the last statement of a
# cell; move head()/describe() to their own cells to see their output.
quizdata.head()
print(quizdata.dtypes)
print("Data type : ", type(quizdata))
print("Data dims : ", quizdata.shape)
quizdata.describe() #.head() or .round()
quizdata.info()
# Plots

# Set up a matplotlib figure with a 2 x 3 grid of subplots:
# one row per variable, one column per plot type (box / histogram / violin).
f, axes = plt.subplots(2, 3, figsize=(24, 12))

# Plot the basic uni-variate figures for the first variable (df1).
# The data is passed by keyword because the meaning of the first positional
# argument of boxplot/violinplot differs between seaborn releases.
# histplot replaces the deprecated distplot; np.ravel flattens the input to
# 1-D, mirroring the squeeze distplot applied to its input.
sb.boxplot(data = df1, orient = "h", ax = axes[0,0])
sb.histplot(x = np.ravel(df1), kde = False, ax = axes[0,1])
sb.violinplot(data = df1, orient = "h", ax = axes[0,2])

# Plot the basic uni-variate figures for the second variable (df2), in green
sb.boxplot(data = df2, orient = "h", ax = axes[1,0], color = 'g')
sb.histplot(x = np.ravel(df2), kde = False, ax = axes[1,1], color = 'g')
sb.violinplot(data = df2, orient = "h", ax = axes[1,2], color = 'g')
# set kde=True to overlay a kernel density estimate (a smoothed curve of the
#   data's distribution -- it is not a fitted normal distribution)
# change name of dfs
# change color
# change axes

# Draw the Boxplots of all variables
f, axes = plt.subplots(1, 1, figsize=(24, 12))
sb.boxplot(data = numDF, orient = "h")

# Draw the distributions of all variables: one row per column of numDF,
# histogram + KDE on the left, violin plot on the right.
# The grid is sized from numDF so it neither leaves empty rows nor overflows
# when the number of variables changes; squeeze=False keeps `axes` 2-D even
# when numDF has a single column.
n_vars = len(numDF.columns)
f, axes = plt.subplots(n_vars, 2, figsize=(12, 4 * n_vars), squeeze = False)

for count, var in enumerate(numDF):
    # histplot with kde=True and stat="density" is the replacement for the
    # deprecated distplot default (density histogram with a KDE curve).
    sb.histplot(x = numDF[var], kde = True, stat = "density", ax = axes[count,0])
    # x= is explicit so the violin is drawn horizontally on every seaborn version.
    sb.violinplot(x = numDF[var], ax = axes[count,1])

# Jointplot: df1 on the x-axis against df2 on the y-axis
sb.jointplot(height = 8, x = df1, y = df2)

# Pairplot: every variable of numDF against every other - MultiVariate Stats
sb.pairplot(data = numDF)

# Catplot: number of rows per "Generation" level, drawn as horizontal bars
sb.catplot(data = quizdata, kind = "count", y = "Generation")

# Create a joint dataframe by concatenating Total and Legendary column-wise.
# pd.concat no longer accepts join_axes (removed in pandas 1.0); reindexing the
# result on total_train's index is the documented replacement.
jointDF = pd.concat([total_train, legnd_train], axis = 1).reindex(total_train.index)

## Joint Boxplot of Total Train against Legendary Train
f, axes = plt.subplots(1, 1, figsize=(18, 6))
sb.boxplot(x = "Total", y = "Legendary", data = jointDF, orient = "h")
## Joint Swarmplot of Total Train against Legendary Train
f, axes = plt.subplots(1, 1, figsize=(18, 6))
sb.swarmplot(x = "Total", y = "Legendary", data = jointDF, orient = "h")

In [None]:
# Summarise a categorical column: distinct levels and their frequencies
generation_col = quizdata["Generation"]

# How many distinct Generations appear in the dataset
print("Number of Generations :", len(generation_col.unique()))

# How many Pokemons fall in each Generation (table, then bar chart)
print(generation_col.value_counts())
sb.catplot(data = quizdata, kind = "count", y = "Generation")

In [None]:
# Create a joint dataframe by concatenating the two variables column-wise.
# pd.concat no longer accepts join_axes (removed in pandas 1.0); reindexing the
# result on df1's index is the documented replacement.
jointDF = pd.concat([df1, df2], axis = 1).reindex(df1.index)

# Calculate the correlation between the two columns/variables
# (only rendered if this is the last statement of its cell)
jointDF.corr()

# Heatmap of the correlation between the two variables
sb.heatmap(jointDF.corr(), vmin = -1, vmax = 1, annot = True, fmt=".2f")

# Heatmap of the Correlation Matrix - MultiVariate Stats
f, axes = plt.subplots(1, 1, figsize=(12, 8))
sb.heatmap(numDF.corr(), vmin = -1, vmax = 1, annot = True, fmt = ".2f")

In [None]:
# Classification Tree

# Template inputs (example use: PRICE against HEIGHT).
# Every value below is a placeholder -- set it for your data before running.
# They are named variables so the cell stays valid Python while unedited.
predictor = df1        # CHANGE THIS: univariate predictor; must be a DataFrame
                       #   because the tree plot later reads X_train.columns
response = df2         # CHANGE THIS: response variable, usually a category
test_fraction = 0.25   # CHANGE THIS: share of rows held out for testing
                       #   (0.25 is what sklearn uses when nothing is given)
tree_depth = None      # CHANGE THIS: maximum depth of the tree
                       #   (None is sklearn's default: no depth limit)

# Split the data into random Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(predictor, response, test_size = test_fraction)
# Check the sample sizes
print("Train Set :", X_train.shape)
print("Test Set  :", X_test.shape)

# Train the Decision Tree Classifier model on the Train set
dectree = DecisionTreeClassifier(max_depth = tree_depth)
dectree.fit(X_train, y_train)

# Predicted classes for both partitions (kept for later inspection)
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Report the classification accuracy on the Train data, then on the Test data
for part_name, X_part, y_part in (("Train", X_train, y_train),
                                  ("Test", X_test, y_test)):
    print("Goodness of Fit of Model \t" + part_name + " Dataset")
    print("Classification Accuracy \t:", dectree.score(X_part, y_part))
    print()

# Render the fitted Decision Tree with graphviz
tree_style = dict(
    feature_names = X_train.columns,   # label split nodes with the predictor names
    out_file = None,                   # return the DOT source as a string, not a file
    filled = True,                     # colour the nodes
    rounded = True,                    # rounded node boxes
    special_characters = True,         # keep special characters in the labels
)
treedot = export_graphviz(dectree, **tree_style)

graphviz.Source(treedot)

In [None]:
# Build the Linear Regression model and fit it: HP Train -> Total Train
linreg = LinearRegression()
linreg.fit(hp_train, total_train)

# Report the fitted line's parameters (b = intercept, a = slope)
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Points on the Regression line: y = a * x + b, evaluated at the training HP values
regline_x = hp_train
regline_y = hp_train * linreg.coef_ + linreg.intercept_

# Training data as a scatter, with the Regression line drawn over it in red
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(hp_train, total_train)
axes.plot(regline_x, regline_y, 'r-', linewidth = 3)
plt.show()

# Model's Total predictions for the HP values it was trained on
total_train_pred = linreg.predict(hp_train)

# Actual training points (default colour) against the predicted points (red)
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(hp_train, total_train)
axes.scatter(hp_train, total_train_pred, color = "r")
plt.show()

# Explained Variance (R^2) of the model on the Train data
r_squared_train = linreg.score(hp_train, total_train)
print("Explained Variance (R^2) \t:", r_squared_train)

# Mean Squared Error (MSE)
def mean_sq_err(actual, predicted):
    '''Compute the Mean Squared Error between two array-likes.

    Both inputs are converted to NumPy arrays, subtracted element-wise,
    squared, and averaged over all elements.
    '''
    residuals = np.array(actual) - np.array(predicted)
    squared_residuals = np.square(residuals)
    return np.mean(squared_residuals)


# Training error: MSE and its square root (RMSE)
mse = mean_sq_err(total_train, total_train_pred)
for error_label, error_value in (("Mean Squared Error (MSE) \t:", mse),
                                 ("Root Mean Squared Error (RMSE) \t:", np.sqrt(mse))):
    print(error_label, error_value)

# Model's Total predictions for the held-out HP Test values
total_test_pred = linreg.predict(hp_test)

# Actual test points (green) against the predicted test points (red)
f, axes = plt.subplots(1, 1, figsize=(16, 8))
axes.scatter(hp_test, total_test, color = "green")
axes.scatter(hp_test, total_test_pred, color = "red")
plt.show()

# MultiVariate Stats - set X to contain more than one 'Column'.
# The frame loaded by this notebook is `quizdata`; no `pkmndata` is defined.
X = pd.DataFrame(quizdata[["HP", "Attack", "Defense"]])