<a href="https://colab.research.google.com/github/ylfoo/HelloWorld/blob/main/Examples/Examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Examples

## Initialization

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
from warnings import filterwarnings
# Ignore every warning for the rest of the session. This keeps cell output
# clean, but it also hides deprecation notices raised by the libraries below.
filterwarnings('ignore')

## Collect Data

In [None]:
# Example 1: use a dataset that ships with scikit-learn
from sklearn.datasets import load_iris

# Fetch the Iris dataset object
iris_data = load_iris()

# Unpack the feature matrix and the target vector in one step
X, y = iris_data.data, iris_data.target

# Show what the columns of X and the values of y stand for
print("Feature Names:", iris_data.feature_names)
print("Target Names:", iris_data.target_names)

In [None]:
# Example 2: read a dataset from a CSV file with pandas
from pandas import read_csv

df = read_csv("heights_weights_genders.csv")
print(df.head())  # preview the first rows

# 'Gender' is the target; every other column is kept as a feature
y = df['Gender']
X = df.drop(columns=['Gender'])

# Sizes of the full frame, the feature frame and the target series
print(df.shape, X.shape, y.shape)

In [None]:
# Example 3: summarise a dataset read from a CSV file
from pandas import read_csv

df = read_csv('heights_weights_genders.csv')

# Statistical summary of the data
print(df.describe())

# Class breakdown: number of rows for each 'Gender' value
class_counts = df.groupby('Gender').size()
print(class_counts)

## Handling Missing Values

In [None]:
# Example 1
# Handling missing values by dropping data samples with missing values
import pandas as pd
import numpy as np

# 'x' marks a missing entry, which makes 'Age' and 'Weight' object-dtype columns
df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                  'Height': [160, 172, 150, 165, 163, 158, 175],
                  'Weight':[50, 68, 43, 52, 47, 49, 'x']})

# Convert every column to a numeric dtype explicitly; entries that are not
# numbers (the 'x' markers) become NaN. Doing the conversion here avoids
# depending on replace() to change the column dtype as a side effect.
df = df.apply(pd.to_numeric, errors='coerce')
assert all(np.issubdtype(dtype, np.number) for dtype in df.dtypes)

print(df)
print(df.isnull().sum())  # number of missing values per column
df = df.dropna() # drop rows with NaN
print(df)

In [None]:
# Example 2
# Handling missing values by imputing missing values with statistic
import pandas as pd
import numpy as np

# 'x' marks a missing entry, which makes 'Age' and 'Weight' object-dtype columns
df = pd.DataFrame({'Age': [17, 23, 'x', 38, 54, 67, 32],
                  'Height': [160, 172, 150, 165, 163, 158, 175],
                  'Weight':[50, 68, 43, 52, 47, 49, 'x']})

# Convert every column to a numeric dtype explicitly; entries that are not
# numbers (the 'x' markers) become NaN. The median and mean below are then
# computed on float columns rather than on object columns.
df = df.apply(pd.to_numeric, errors='coerce')
assert all(np.issubdtype(dtype, np.number) for dtype in df.dtypes)

df['Age'] = df['Age'].fillna(df['Age'].median()) # replace NaN with median
df['Weight'] = df['Weight'].fillna(df['Weight'].mean()) # replace NaN with mean
print(df)

## Handling Categorical Data

In [None]:
import pandas as pd

# Ordinal scale for the 'review' column, ordered from worst to best
mapping = {'poor':1, 'moderate':2, 'good':3}

# Build the frame and encode the ordinal 'review' column in a single chain
df0 = pd.DataFrame({
    'year':[2015, 2017, 2013, 2018, 2020],
    'make':['Toyota', 'Honda', 'Perodua', 'Hyundai', 'Toyota'],
    'engine':[1.5, 1.8, 1.3, 1.6, 1.8],
    'review':['moderate', 'good', 'poor', 'moderate', 'good'],
}).assign(review=lambda frame: frame['review'].map(mapping))

# Encode nominal data: the text column 'make' becomes one indicator column
# per distinct value
df0 = pd.get_dummies(df0)
print(df0)

## Hold-out Validation

In [None]:
# Hold-out validation: train on one part of the data, score on the rest
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier

df = read_csv("heights_weights_genders.csv")
y = df['Gender']                  # target labels
X = df.drop(columns=['Gender'])   # all remaining columns are features

# Hold back 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Fit on the training rows only, then measure accuracy on the held-out rows
model = KNeighborsClassifier().fit(X_train, y_train)
result = model.score(X_test, y_test)
print(f"Accuracy: {result:.2%}")

## k-fold Cross-Validation

In [None]:
# k-fold Cross-validation: score the model on 5 shuffled folds
from pandas import read_csv
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

df = read_csv("heights_weights_genders.csv")
y = df['Gender']                  # target labels
X = df.drop(columns=['Gender'])   # all remaining columns are features

# Five folds, shuffled with a fixed seed so the folds are repeatable
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
model = KNeighborsClassifier()

# One score per fold; report their mean and standard deviation
results = cross_val_score(model, X, y, cv=kfold)
print(f"Accuracy: {results.mean():.2%} ({results.std():.2%})")

## Linear Regression

In [None]:
# Linear Regression: predict 'Weight(kg)' from the remaining numeric columns
from pandas import read_csv
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as split

df = read_csv("heights_weights_genders.csv")
y = df['Weight(kg)']                            # regression target
X = df.drop(columns=['Weight(kg)', 'Gender'])   # drop the target and the class label

# Hold back 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Fit on the training rows, then report the score on the held-out rows
lr = LinearRegression()
lr.fit(X_train, y_train)
print(f'R2 score: {lr.score(X_test, y_test):.2f}')

## k-NN Regression

In [None]:
# k-NN Regressor: predict 'Weight(kg)' from the remaining numeric columns
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsRegressor

df = read_csv("heights_weights_genders.csv")
y = df['Weight(kg)']                            # regression target
X = df.drop(columns=['Weight(kg)', 'Gender'])   # drop the target and the class label

# Hold back 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Fit on the training rows, then report the score on the held-out rows
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print(f'R2 score: {knn.score(X_test, y_test):.2f}')

## Logistic Regression

In [None]:
# Logistic Regression: classify 'Gender' from the remaining columns
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as split

df = read_csv("heights_weights_genders.csv")
y = df['Gender']                  # target labels
X = df.drop(columns=['Gender'])   # all remaining columns are features

# Hold back 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Fit on the training rows, then report accuracy on the held-out rows
lgr = LogisticRegression()
lgr.fit(X_train, y_train)
print(f'Accuracy: {lgr.score(X_test, y_test):.2%}')

## k-NN Classification

In [None]:
# k-NN Classification: classify 'Gender' from the remaining columns
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.neighbors import KNeighborsClassifier

df = read_csv("heights_weights_genders.csv")
y = df['Gender']                  # target labels
X = df.drop(columns=['Gender'])   # all remaining columns are features

# Hold back 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# Fit on the training rows, then report accuracy on the held-out rows
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print(f'Accuracy: {knn.score(X_test, y_test):.2%}')

In [None]:
# Decision Tree Classification
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Load the data: 'Gender' is the target, every other column is a feature
df = read_csv("heights_weights_genders.csv")
X = df.drop(columns=['Gender'])
y = df['Gender']
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# No depth or leaf limit is set for this tree; printing both scores lets
# train and test accuracy be compared side by side.
dtc = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print(f'Train accuracy: {dtc.score(X_train, y_train):.2%}')
print(f'Test accuracy: {dtc.score(X_test, y_test):.2%}')

# Plot tree
# Feature names come from the feature frame itself, as a plain list, so they
# match the fitted columns wherever 'Gender' sits in the CSV.
# NOTE(review): class_names assumes the fitted classes sort as Female, Male;
# confirm against dtc.classes_.
plt.figure(figsize=(10, 10))
plot_tree(dtc, feature_names=list(X.columns), class_names=['Female', 'Male'], rounded=True, filled=True)
plt.show()

In [None]:
# Decision Tree Classification with maximum depth of 2
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Load the data: 'Gender' is the target, every other column is a feature
df = read_csv("heights_weights_genders.csv")
X = df.drop(columns=['Gender'])
y = df['Gender']
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# max_depth=2 caps the depth of the tree; printing both scores lets train
# and test accuracy be compared side by side.
dtc = DecisionTreeClassifier(random_state=42, max_depth=2).fit(X_train, y_train)
print(f'Train accuracy: {dtc.score(X_train, y_train):.2%}')
print(f'Test accuracy: {dtc.score(X_test, y_test):.2%}')

# Plot tree
# Feature names come from the feature frame itself, as a plain list, so they
# match the fitted columns wherever 'Gender' sits in the CSV.
# NOTE(review): class_names assumes the fitted classes sort as Female, Male;
# confirm against dtc.classes_.
plt.figure(figsize=(10, 10))
plot_tree(dtc, feature_names=list(X.columns), class_names=['Female', 'Male'], rounded=True, filled=True)
plt.show()

In [None]:
# Decision Tree Classification with maximum leaf nodes of 6
from pandas import read_csv
from sklearn.model_selection import train_test_split as split
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Load the data: 'Gender' is the target, every other column is a feature
df = read_csv("heights_weights_genders.csv")
X = df.drop(columns=['Gender'])
y = df['Gender']
X_train, X_test, y_train, y_test = split(X, y, test_size=0.25, random_state=42)

# max_leaf_nodes=6 caps the number of leaves; printing both scores lets
# train and test accuracy be compared side by side.
dtc = DecisionTreeClassifier(random_state=42, max_leaf_nodes=6).fit(X_train, y_train)
print(f'Train accuracy: {dtc.score(X_train, y_train):.2%}')
print(f'Test accuracy: {dtc.score(X_test, y_test):.2%}')

# Plot tree
# Feature names come from the feature frame itself, as a plain list, so they
# match the fitted columns wherever 'Gender' sits in the CSV.
# NOTE(review): class_names assumes the fitted classes sort as Female, Male;
# confirm against dtc.classes_.
plt.figure(figsize=(10, 10))
plot_tree(dtc, feature_names=list(X.columns), class_names=['Female', 'Male'], rounded=True, filled=True)
plt.show()

## k-means Clustering

In [None]:
# k-means clustering of the feature columns into two groups
from pandas import read_csv
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

df = read_csv("heights_weights_genders.csv")
X = df.drop(columns=['Gender'])  # cluster on the features only; the labels are not used

# Fix the seed and the number of initialisations so the cluster labels (and
# therefore the colours in the plot) are the same on every run.
km = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X)

# Left: raw points. Right: the same points coloured by assigned cluster.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('k-means Clustering')
ax1.scatter(X.values[:, 0], X.values[:, 1])
ax1.set_title('Before clustering')
ax2.scatter(X.values[:, 0], X.values[:, 1], c=km.labels_)
ax2.set_title('After clustering')

# Label both panels with the names of the two plotted columns
for ax in (ax1, ax2):
    ax.set(xlabel=X.columns[0], ylabel=X.columns[1])
fig.tight_layout()
plt.show()