In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from pydataset import data
from env import get_db_url

# Classification
* Supervised machine learning
* Is this new observation A or B (or C, D, or E)?
* Categorical


- `Classifier`:
    - Binary = 2 outcomes = pass/fail
    - Multi-class = more than 2 classes = school grade levels (1st-12th)
- `Algorithm` vs `Model`: General vs Specific
- `Feature`: A feature, aka input/independent variable, is an individual measurable property of a phenomenon being observed
- `Database` vs `Dataset`: DB has datasets

#### sklearn classification models
- Logistic Regression (sklearn.linear_model.LogisticRegression)
    - Predict binary outcome
- Decision Tree (sklearn.tree.DecisionTreeClassifier)
    - tree splitting data based on rules
- K-Nearest Neighbors (sklearn.neighbors.KNeighborsClassifier)
    - classifies an observation by the majority class among its k nearest (closest) neighbors
- Random Forest (sklearn.ensemble.RandomForestClassifier)
    - an ensemble of many decision trees, each built on a random sample of the data; the prediction is the majority vote across the trees

## Data Acquisition


CSV
- `pd.read_csv(`url`)`
    - url = google_sheets`.replace(`'/edit#gid=', '/export?format=csv&gid='`)`


Clipboard (table in text)
- `pd.read_clipboard()`


MS Excel
- `pd.read_excel(`url`)`


SQL
- `pd.read_sql(`sql_query,sql_url`)`

Caching Data
- df`.to_csv(`new_filename.csv`)`

In [4]:
# import os

# NOTE(review): `get_connection` is not imported in this notebook (the imports
# cell brings in `get_db_url` from env) — confirm which helper to use before
# uncommenting this cell.
# def get_titanic_data():
#     """Return the titanic passengers table, reading a local CSV cache when one exists."""
#     filename = "titanic.csv"

#     if os.path.isfile(filename):
#         return pd.read_csv(filename)
#     else:
#         # read the SQL query into a dataframe
#         df = pd.read_sql('SELECT * FROM passengers', get_connection('titanic_db'))

#         # Write that dataframe to disk for later. Called "caching" the data for later.
#         # (DataFrames have no `to_file` method; `to_csv` writes the CSV that read_csv loads above.)
#         df.to_csv(filename)

#         # Return the dataframe to the calling code
#         return df

### Exercises

Use a python module (pydataset or seaborn datasets) containing datasets as a source for the iris data. Create a pandas dataframe, `df_iris`, from this data

In [6]:
# Load the iris dataset by name through pydataset's `data` helper.
df_iris = data('iris')

In [7]:
# Preview the first 3 rows to sanity-check the load.
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


In [8]:
# Show the frame's dimensions as a (rows, columns) tuple.
df_iris.shape

(150, 5)

In [9]:
# List the column names (returned as a pandas Index).
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

In [10]:
# Print the data type of each column; info() also reports non-null counts
# and memory usage alongside the dtypes.
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the object-typed Species column is left out of the result.
df_iris.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Read the data from this google sheet into a dataframe, `df_google`.


In [12]:
# Share link for the Google Sheet holding the exercise data.
gsh = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit?usp=sharing'
# Swap the '/edit?' segment for the CSV export form so pandas receives CSV
# instead of the sheet's web page.
url = gsh.replace('/edit?', '/export?format=csv&')
df_google = pd.read_csv(url)

In [13]:
# Preview the first 3 rows to sanity-check the download.
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [14]:
# Show the frame's dimensions as a (rows, columns) tuple.
df_google.shape

(891, 12)

In [15]:
# List the column names (returned as a pandas Index).
df_google.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [16]:
# Print the data type of each column; info() also reports non-null counts,
# which is where missing values in a column become visible.
df_google.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only.
df_google.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [19]:
# Show the distinct values of each categorical column, one array per line.
categorical_cols = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for col in categorical_cols:
    print(df_google[col].unique())

[0 1]
[3 1 2]
['male' 'female']
[1 0 3 4 2 5 8]
[0 1 2 5 3 4 6]
['S' 'C' 'Q' nan]


Download the previous exercise's file as an Excel file (File → Download → Microsoft Excel). Read the downloaded file into a dataframe named `df_excel`.

In [33]:
# Read the downloaded workbook; 'train.xlsx' is resolved relative to the
# notebook's working directory.
df_excel = pd.read_excel('train.xlsx')

In [34]:
# Assign the first 100 rows to a new dataframe, df_excel_sample;
# df_excel itself is left unchanged.
df_excel_sample = df_excel.head(100)

In [35]:
# print the number of rows of your original dataframe
# (shape is a (rows, columns) tuple, so the row count is its first element)
df_excel.shape

(891, 12)

In [36]:
# Slice the column Index to get the first 5 column names.
df_excel.columns[:5]

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex'], dtype='object')

In [48]:
# Keep only the object-dtype columns, then list their names.
df_excel.select_dtypes(include='object').columns

Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')

In [58]:
# Report the range of every numeric column as its minimum and maximum,
# one "min max" pair per line.
num_col = df_excel.select_dtypes(include='number').columns
for name in num_col:
    series = df_excel[name]
    print(series.min(), series.max())

1 891
0 1
1 3
0.42 80.0
0 8
0 6
0.0 512.3292


## Data Preparation

Summarize data:
- `head()`, `describe()`, `info()`, `isnull()`, `value_counts()`, `shape`, ...
- `plt.hist()`, `plt.boxplot()`
- document takeaways (nulls, dtypes to change, outliers, ideas for features, etc.)

Clean data:
- `missing values`: drop columns/rows with too many missing values, fill with 0, take note of applicable columns/rows
- `outlier`: obs distant from other observations
    - ignore, drop rows, snap to selected max/min, create bins (cut, qcut)
- `data errors`: drop rows/obs with err, correct them
- `txt normalization`: correct/standardize text ('C' or 'c')
- `tidy data`: shape the data for modeling and exploring
    - 1 observation per row, 1 variable per column
    - take care of duplicates, aggregate, melt, reshape
- `create new` variables (z = x - y)
- `rename` columns
- `data-types`: need numeric data for model (dummy vars, factor vars, manual coding)
- `scale numeric data`: continuous vars have same weight, same units
    - linear scalers or non-linear scalers

Split data:
- `split` to train, validate, test sample dataframes
- `train`: in-sample, explore, impute mean, scale numeric data (max-min...), fit ml algorithms, test models
- `validate`: confirm top models don't overfit, test on unseen data
    - validate performance, pick best model
- `test`: out-of-sample, expected model performance on unseen data
    - only used on 1 model
