<h3>Example 1: Data Preprocessing</h3>

In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Fetch the iris dataset and separate the feature matrix from the class labels
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    random_state=42,
)

# Show the raw training features before any scaling is applied
print("First 5 rows of the training data:")
print(X_train[:5])

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits so the test data never influences the statistics
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the same rows after standardization (zero mean, unit variance per feature)
print("\n\n First 5 rows of the scaled training data:")
print(X_train_scaled[:5])


First 5 rows of the training data:
[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]]


 First 5 rows of the scaled training data:
[[-1.47393679  1.20365799 -1.56253475 -1.31260282]
 [-0.13307079  2.99237573 -1.27600637 -1.04563275]
 [ 1.08589829  0.08570939  0.38585821  0.28921757]
 [-1.23014297  0.75647855 -1.2187007  -1.31260282]
 [-1.7177306   0.30929911 -1.39061772 -1.31260282]]


<h3>Example 2: Simple Imputer (Handling Missing Values)</h3>

In [14]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Build a small frame in which each column contains one missing entry
data = {
    'Feature1': [1, 2, np.nan, 4],
    'Feature2': [np.nan, 2, 3, 4],
}
df = pd.DataFrame(data)

# Show the frame as it looks before imputation
print("Original DataFrame:\n")
print(df)

# Mean strategy: every NaN is replaced by the mean of its own column
imputer = SimpleImputer(strategy='mean')
imputer.fit(df)

# transform() hands back a plain array, so re-attach the original column labels
df_imputed = pd.DataFrame(imputer.transform(df), columns=df.columns)

# Show the frame with the gaps filled in
print("\nDataFrame after imputing missing values:\n")
print(df_imputed)


Original DataFrame:

   Feature1  Feature2
0       1.0       NaN
1       2.0       2.0
2       NaN       3.0
3       4.0       4.0

DataFrame after imputing missing values:

   Feature1  Feature2
0  1.000000       3.0
1  2.000000       2.0
2  2.333333       3.0
3  4.000000       4.0


<h3>Example 3: OneHotEncoder and LabelEncoder</h3>

<b>OneHotEncoder</b> converts categorical variables into binary (one-hot) vectors that machine learning algorithms can consume directly, which often leads to better predictions. One of its main advantages is that it avoids introducing ordinal relationships between categories that have no intrinsic order. By giving each category its own binary column, it ensures that every category is treated as equally distinct and independent.

<b>LabelEncoder</b> converts categorical labels into integer codes (0 to n_classes − 1). It is intended for encoding the target variable rather than input features.

In [23]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Small frame mixing one categorical column with one numeric column
data = {
    'Category': ['A', 'B', 'A', 'C'],
    'Value': [10, 20, 10, 30],
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df, "\n\n")

# One-hot encode the categorical column; sparse_output=False asks for a dense array.
# The double brackets keep the input two-dimensional (a one-column DataFrame).
encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df[['Category']])

print("encoded")
print(encoded,"\n\n")

# Label the encoded columns with the names generated by the encoder
category_columns = encoder.get_feature_names_out(['Category'])
encoded_df = pd.DataFrame(encoded, columns=category_columns)

print("OneHotEncoded DataFrame:")
print(encoded_df, "\n\n")

# Swap the original categorical column for its one-hot columns
df_onehot = pd.concat([df.drop(columns='Category'), encoded_df], axis=1)

print("DataFrame after OneHotEncoding:")
print(df_onehot, "\n\n")

# Separate demonstration: map a list of string labels to integer codes
labels = ['A', 'B', 'A', 'C']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

print("Encoded labels:")
print(y_encoded, "\n\n")



Original DataFrame:
  Category  Value
0        A     10
1        B     20
2        A     10
3        C     30 


encoded
[[1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]] 


['Category_A' 'Category_B' 'Category_C']
OneHotEncoded DataFrame:
   Category_A  Category_B  Category_C
0         1.0         0.0         0.0
1         0.0         1.0         0.0
2         1.0         0.0         0.0
3         0.0         0.0         1.0 


DataFrame after OneHotEncoding:
   Value  Category_A  Category_B  Category_C
0     10         1.0         0.0         0.0
1     20         0.0         1.0         0.0
2     10         1.0         0.0         0.0
3     30         0.0         0.0         1.0 


Encoded labels:
[0 1 0 2] 




### Example 4: Pipeline for Preprocessing and Model Training

In this example, we will create a preprocessing pipeline that includes handling missing values and scaling features. We will then add a classifier to this pipeline and train the model.

#### Steps:
1. Create a preprocessing pipeline for numeric features.
2. Combine preprocessing and classifier into one pipeline.
3. Train the model using the pipeline.
4. Predict the test set results.

In [26]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Fetch the iris dataset and separate the feature matrix from the class labels
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_iris,
    y_iris,
    test_size=0.2,
    random_state=42,
)

# Numeric preprocessing: fill missing values with the column median, then standardize.
# The column indices below select the features this transformer is applied to.
numeric_features = [0, 1, 2, 3]
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Route the selected columns through the numeric transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('my_numeric_transformer', numeric_transformer, numeric_features),
    ]
)

# Full pipeline: preprocessing first, then the classifier on the preprocessed data
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Fit preprocessing and classifier together on the training split
pipeline.fit(X_train, y_train)

# Run the fitted pipeline on the held-out samples
y_pred = pipeline.predict(X_test)

print("Predicted labels:")
print(y_pred, "\n\n")


Predicted labels:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0] 




### Example 5: Comprehensive Pipeline Example ###


In [27]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris

# Load iris data for classification
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Wrap the features in a DataFrame so a categorical column can be added for the encoding step
df = pd.DataFrame(X_iris, columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'])

# Draw the synthetic categorical feature from a seeded generator. With the unseeded
# global np.random state this column (and therefore the fitted model and its
# predictions) changed on every run, even though the split and the classifier are seeded.
category_rng = np.random.default_rng(42)
df['category'] = category_rng.choice(['A', 'B', 'C'], size=df.shape[0])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df, y_iris, test_size=0.2, random_state=42)

# Numeric and categorical features
numeric_features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
categorical_features = ['category']

# Creating preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Replace missing values with the column median
    ('scaler', StandardScaler()),  # Standardize features by removing the mean and scaling to unit variance
    ('poly', PolynomialFeatures(degree=2, include_bias=False))  # Generate degree-2 polynomial and interaction features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # Replace missing values with the constant 'missing'
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Encode categories as binary vectors; unknown categories are ignored at transform time
])

# Combine preprocessing steps using ColumnTransformer to apply the numeric and
# categorical transformers to their respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Complete pipeline: preprocessing, PCA, feature selection, and classification.
# NOTE: k=5 equals the number of PCA components, so SelectKBest currently keeps
# every component; lower k if the selection step should actually filter.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply the preprocessor to the data
    ('pca', PCA(n_components=5)),  # Reduce the number of features to 5 principal components
    ('select', SelectKBest(score_func=f_classif, k=5)),  # Select the top 5 features based on ANOVA F-value
    ('classifier', RandomForestClassifier(random_state=42))  # Classify the transformed data
])

# Training the model
pipeline.fit(X_train, y_train)

# Predicting the test set results
y_pred = pipeline.predict(X_test)

# Displaying the predicted results
print("Predicted labels:")
print(y_pred, "\n\n")

# Displaying the actual labels for a side-by-side comparison with the predictions
print("Actual labels:")
print(y_test, "\n\n")



Predicted labels:
[1 0 2 1 2 0 1 2 2 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0] 


Actual labels:
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0] 




### Simple Linear Regression Algorithm ###


In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.datasets import fetch_california_housing

# Fetch the California housing dataset and separate features from the target
california_housing = fetch_california_housing()
X, y = california_housing.data, california_housing.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression model on the training split
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out samples
y_pred = model.predict(X_test)

# Evaluate the predictions: squared error, absolute error, and coefficient of determination
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")


Mean Squared Error (MSE): 0.5558915986952424
Mean Absolute Error (MAE): 0.5332001304956981
R-squared: 0.5757877060324523
