# ML-Pipeline

## 0. Dependencies

In [2]:
# setup dependencies
import sys
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# extra code – default font sizes, applied one rc group at a time
for rc_group, rc_params in (
    ('font', {'size': 14}),
    ('axes', {'labelsize': 14, 'titlesize': 14}),
    ('legend', {'fontsize': 14}),
    ('xtick', {'labelsize': 10}),
    ('ytick', {'labelsize': 10}),
):
    plt.rc(rc_group, **rc_params)

## 1. Read & Introduce Data

In [None]:
# Load the raw dataset from a semicolon-separated CSV file and preview the first rows.
read_df = pd.read_csv('data.csv',sep=';')
read_df.head()

In [None]:
# Print a concise summary of the frame (column dtypes and non-null counts).
read_df.info()

In [None]:
# Descriptive statistics (by default computed for the numeric columns).
read_df.describe()

### 1.1 Feature Description
List of features and their meaning.

## 2. Analyse Data

In [None]:
# Frame used for the exploratory analysis. NOTE: this binds the same object as
# `read_df` (no copy is made), so in-place changes would affect both names.
analysis_df = read_df

### 2.1 Histograms

In [None]:
# Plot one histogram per column of the analysis frame on a 20x15 inch figure.
analysis_df.hist(figsize=(20,15))

### 2.2 Boxplots

In [None]:
# Columns to draw boxplots for – replace the placeholders with real column names.
box_plot_features = ['<FEAT_1>', '<FEAT_N>']

In [None]:

# Draw one boxplot per selected feature, side by side in a single row.
analysis_df = read_df[box_plot_features]
# Outlier (flier) markers: red circles with a white edge.
red_circle = dict(markerfacecolor='red', marker='o', markeredgecolor='white')

# squeeze=False always returns a 2-D array of Axes. Without it, selecting a
# single feature yields a bare Axes object, which has no `.flat` attribute.
fig, axs = plt.subplots(1, len(analysis_df.columns), figsize=(20,10), squeeze=False)

for i, ax in enumerate(axs.flat):
    # Matplotlib's boxplot does not skip NaN values (the box would come out
    # empty), so missing values are dropped per column before plotting.
    ax.boxplot(analysis_df.iloc[:,i].dropna(), flierprops=red_circle)
    ax.set_title(analysis_df.columns[i], fontsize=20, fontweight='bold')
    ax.tick_params(axis='y', labelsize=14)

plt.tight_layout()

## 3. Train Test Split

In [None]:
# Frame that gets split into train and test sets. NOTE: this binds the same
# object as `read_df` (no copy is made).
test_train_df = read_df

In [None]:
# Name of the column to predict – replace the placeholder with the real column name.
target_prediction = '<TARGET>'

In [None]:
from sklearn.model_selection import train_test_split
# Fraction of the rows held out for testing.
test_size = 0.2
# The target is a single column, so select it by name. `DataFrame.drop` with a
# bare label operates on the row index (axis=0) and would raise a KeyError here
# instead of returning the target values.
y = test_train_df[target_prediction]
# todo: compare holdout to cross validation
#   - Pay attention to your splits and settings
#       Are there differences? Why? In which metrics? What could have caused it?
#   - Compare/document changes in runtime behaviour with the changing eg. dataset size
# NOTE: the full frame is split, so the target column is still part of
# X_train/X_test here. The preprocessing cell in section 6 drops it; make sure
# it is removed before any model is fitted.
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(test_train_df, y, test_size=test_size, random_state=42)
(len(X_train), len(X_test))

## 4. Correlations

In [None]:
# Work on a copy of the training features and attach the training target, so
# correlations are computed on training data only and X_train stays unmodified.
correlation_df = X_train.copy()
correlation_df[target_prediction] = y_train

In [None]:
# Correlation of every numeric column with the target, sorted in descending order.
correlation_df.corr(numeric_only=True)[target_prediction].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Grid of pairwise scatter plots for the columns of `correlation_df`.
scatter_matrix(correlation_df, figsize=(20,10))

## 5. Feature Combination

In [None]:
# Separate copy of the training features plus the training target, used for
# experimenting with derived features without modifying X_train.
feature_combination_df = X_train.copy()
feature_combination_df[target_prediction] = y_train

In [None]:
# create N new feature from existing ones
correlation_df['<NEW_FEATURE>'] = correlation_df['<OLD_FEATURE_1>']/correlation_df['<OLD_FEATURE_1>']

In [None]:
correlation_df.corr(numeric_only=True)[target_prediction].sort_values(ascending=False)

## 6. Preprocessing & Feature Scaling

In [None]:
# Copy of the training features, so preprocessing experiments do not modify X_train.
preprocessing_df = X_train.copy()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# todo: prepare data (missing values, outliers, scaling, encoding, etc.)
# The preprocessing pipeline currently consists of a single standardisation step.
preprocessor = Pipeline(steps=[('std_scaler', StandardScaler())])
# Columns removed before scaling: the target and the non-numeric placeholder column.
nan_and_target = [target_prediction, '<NOT_NUMERIC>']
preprocessing_df_num = preprocessing_df.drop(nan_and_target, axis=1)
# Fit the pipeline on the remaining columns and display the transformed result.
piped_preprocessing_df = preprocessor.fit_transform(preprocessing_df_num)
piped_preprocessing_df

## 7. Model Selection

In [None]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# todo: implement proper preprocessing for each dataset (missing values, outliers, scaling, encoding, etc.)
# Use only numeric feature columns and make sure the target is not among them.
# The train/test split is performed on the full frame, so scaling X_train
# directly would leak the target into the features and would fail on
# non-numeric columns.
feature_columns = (
    X_train.drop(columns=[target_prediction], errors='ignore')
    .select_dtypes(include='number')
    .columns
)
X_train_features = X_train[feature_columns]
X_test_features = X_test[feature_columns]

# Fit a fresh clone so the shared `preprocessor` object is not re-fitted in place.
pipeline = clone(preprocessor)
X_train_scaled = pipeline.fit_transform(X_train_features)
X_test_scaled = pipeline.transform(X_test_features)

# todo: pick only 3 classifiers, but make sure that
#   you choose from at least two different "types"/"paradigms"
#   eg do not choose 3 tree-based classifiers, or 3 NN based classifiers, or 3 ensembles, ...
classifiers = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='rbf', random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
}
# todo: Run classifiers, and Experiments with:
#   - Different classifiers and your datasets
#   - Different parameter settings (= several results per classifier per dataset, not only random/best)
# Maps classifier name -> {"accuracy", "cv_mean", "cv_std"}.
results = {}
for name, clf in classifiers.items():
    # Holdout evaluation: train on the training split, score on the test split.
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    # todo: compare holdout to cross validation
    #   - Pay attention to your splits and settings
    #       Are there differences? Why? In which metrics? What could have caused it?
    #   - Compare/document changes in runtime behaviour with the changing eg. dataset size
    # Cross-validate preprocessing + model as one pipeline on the unscaled
    # features, so the scaler is re-fitted on the training part of each fold
    # and never sees the validation part.
    cv_model = Pipeline([
        ('preprocess', clone(preprocessor)),
        ('clf', clone(clf)),
    ])
    cv_scores = cross_val_score(cv_model, X_train_features, y_train, cv=5)

    results[name] = {
        "accuracy": accuracy,
        "cv_mean": np.mean(cv_scores),
        "cv_std": np.std(cv_scores)
    }

# Print results
for name, result in results.items():
    print(f"\n{name}:")
    print(f"Test Accuracy: {result['accuracy']:.4f}")
    print(f"Cross-validation: {result['cv_mean']:.4f} (+/- {result['cv_std']*2:.4f})")

# todo: Can you identify any patterns/trends?
#   - Which methods work well and which did not, is there e.g. one method
#       outperforming the others on all datasets?
#   - How do the results change when preprocessing strategies change? How sensitive
#       is an algorithm to parameter settings?
#   - Are there differences across the datasets? Design your experiments so that you
#       can investigate the influence of single parameters.
# Compare accuracies
best_classifier = max(results, key=lambda x: results[x]['accuracy'])
# Kept on one line: a line break inside the replacement field of a
# single-quoted f-string is a SyntaxError before Python 3.12.
print(f"\nBest Classifier: {best_classifier} with accuracy {results[best_classifier]['accuracy']:.4f}")

# todo: Evaluate and analyse the performance (primarily effectiveness, 
#   but also provide basic details on efficiency):
#   - Choose suitable, multiple performance measures
#   - Make valid comparisons (among the classifiers, across your datasets, parameters,
#       preprocessing effects...)
#   - (How) can you improve the results?

## 8. Results

Summarise your results in tables and figures! <br>
Document your findings and issues in your report <br>
Upload your best results to the Kaggle competition (more information below) <br>
You do not need to implement the algorithms yourself; rely on libraries/modules <br>
- Write code only for loading data, pre-processing, running configurations, processing/aggregating results, …

### 8.1 Grading key points:
- datasets & classifiers description/choice reasoning, preprocessing
- classification experiments
- analysis of results, summary, interesting findings
- submission package & report (formal requirements, clarity, structure)

Keep in mind that the grading categories are dependent on each other (e.g. if you do not use preprocessing when
needed, your classification and overall analysis will suffer) <br>
Your methodology and reasoning are more important for grading than just achieving the highest e.g. accuracy when
performing classification

### 8.2 Pointers for your project
Apply the knowledge from the lectures <br>
Document the whole process <br>
Carefully design your experiments:
- work out your experiment design together as a group

Important points:
- Explain your choice of datasets, introduce them, their characteristics
- Briefly describe the preprocessing steps and argue why you chose them
    - Evaluate their impact on the results (mainly scaling)
- Explain your choice of classifiers, describe their characteristics
    - there is no need to give a lengthy explanation of how a classifier works (do not repeat
what you heard in the lecture)
- Argue on your choice of performance measures
    - Think and find multiple, suitable measures, argue why you chose them (why are
they necessary, what do they measure/tell us about the performance), and if they
are sufficient
- In the report, include a paragraph briefly describing the steps you took to ensure
that the performance of the classifiers can be compared (think if the
comparison makes sense & research what needs to be fulfilled in order to e.g. compare
the performance of multiple classifiers on one dataset, how to compare the impact of
parameter changes etc.)
- Discuss your experimental results, compare them using tables and figures
- Provide an aggregated comparison of your results as well - i.e. a big table of the
best settings and results for all combinations (and a
summary/findings/conclusions!)
    - The idea is to extract knowledge from your results, not just list everything without
explanations