In [1]:
# Install pm4py (the process-mining library imported below) into the kernel's
# environment, then upgrade pip itself. Neither command pins a version.
%pip install pm4py
%pip install --upgrade pip

Note: you may need to restart the kernel to use updated packages.
Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import datetime
import pickle
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pm4py
import seaborn as sns
import tqdm
from pm4py.algo.discovery.dfg import algorithm as direct_follow_graph
from pm4py.visualization.dfg import visualizer as direct_follow_graph_vis


# Data Understanding

In [3]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing XES_LOG_PATH at a local copy of the log.
XES_LOG_PATH = "/Users/lorenzocassinelli/Desktop/Business Information System/Progetto/Road Traffic Fine Management Process/Road_Traffic_Fine_Management_Process.xes"

# Parse the XES file, then flatten it to one row per event.
log = pm4py.read_xes(XES_LOG_PATH)
df = pm4py.convert_to_dataframe(log)

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

In [None]:
# Display the raw event DataFrame produced by pm4py.convert_to_dataframe.
df

In [None]:
# Data Exploration - Studying the Event Log
# df.info() prints its own report and returns None, so the wrapping print()
# also emits a trailing "None".
print(df.info())
print("\n** PRINTING NULL\n")
print(df.isnull().sum())

# Timestamps must be real datetimes before any duration arithmetic.
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])

num_events = df.shape[0]
num_cases = df['case:concept:name'].nunique(dropna=False)
print("Before Cleaning\n")
print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))

start_activities = pm4py.get_start_activities(df)
end_activities = pm4py.get_end_activities(df)
print("Start activities:{}\nEnd activities:{}".format(start_activities,end_activities))

# One row per case: event count, first/last timestamp and the span between them.
case_durations = df.groupby('case:concept:name').agg(
    Events=('case:concept:name', 'count'),
    FirstOccurence=('time:timestamp', 'min'),
    LastOccurence=('time:timestamp', 'max'),
    Duration=('time:timestamp', lambda stamps: stamps.max() - stamps.min()),
)
all_durations = case_durations['Duration']
min_case_duration = all_durations.min()
max_case_duration = all_durations.max()
mean_case_duration = all_durations.mean()

print("Min Case Duration: {}\nMax Case Duration: {}\nMean Case Duration: {}".format(min_case_duration, max_case_duration, mean_case_duration))


## **DATA OVERVIEW**

In [None]:
# Count frequency of activities (most frequent first)
group_by_activity = df.groupby('concept:name').size().sort_values(ascending=False)

# Plotting the bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
group_by_activity.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Distribution of Activities')
ax.set_xlabel('Activity')
ax.set_ylabel('Count')
# Slanted, right-anchored tick labels keep long activity names readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Two side-by-side pie charts: how cases start and how they end.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

pie_specs = (
    (ax1, start_activities, 'Start Activities'),
    (ax2, end_activities, 'End Activities'),
)
for axis, activity_counts, heading in pie_specs:
    # Each dict maps an activity name to the number of cases starting/ending with it.
    axis.pie(activity_counts.values(), labels=activity_counts.keys(), autopct='%1.1f%%')
    axis.set_title(heading)

plt.show()


## **FREQUENCY ON ARTICLES AND POINTS**

In [None]:
# Keep only events that carry both an amount and a points value.
df_filtered_frequency = df.dropna(subset=['amount', 'points'])
# Of those, the events with a strictly positive amount.
df_filtered_payment = df_filtered_frequency.loc[df_filtered_frequency['amount'].gt(0)]

# Total (summed) amount for each article.
total_amount_per_article = df_filtered_payment.groupby('article')['amount'].sum()

# How often each article and each points value occurs.
article_frequency = df_filtered_frequency['article'].value_counts()
point_frequency = df_filtered_frequency['points'].value_counts()

# Shared layout tweak applied to every bar chart below.
def adjust_plot_layout(plt):
    """Rotate the x tick labels to vertical and tighten the figure layout.

    Args:
        plt: object exposing ``xticks(rotation=...)`` and ``tight_layout()``;
            the callers in this notebook pass ``matplotlib.pyplot``.
    """
    plt.xticks(rotation=90)
    plt.tight_layout()

# Plot 1: Total amount to be paid per article
# total_amount_per_article is a per-article *sum*, so the chart is titled
# "Total" rather than "Max" to match what is actually plotted.
plt.figure(figsize=(10, 6))

total_amount_per_article.plot(kind='bar', title='Total Amount To Be Paid per Article')
plt.xlabel('Article ID')
plt.ylabel('Amount')
adjust_plot_layout(plt)
plt.show()

# Plot 2: Article Frequency
plt.figure(figsize=(10, 6))

article_frequency.plot(kind='bar', title='Frequency of Articles')
plt.xlabel('Article ID')
plt.ylabel('Frequency (Count)')
adjust_plot_layout(plt)
plt.show()

# Plot 3: Point Frequency
plt.figure(figsize=(8, 5))

point_frequency.plot(kind='bar', title='Frequency of Points Removed')
plt.xlabel('Points')
plt.ylabel('Frequency (Count)')
adjust_plot_layout(plt)
plt.show()


# Data Cleaning and Filtering


We now need to filter out the cases that are not useful. The only acceptable ending activities are 'Payment' and 'Send for Credit Collection'.
A case may also end with 'Send Appeal to Prefecture' or 'Appeal to Judge', BUT only if the appeal was won, i.e. if dismissal is '#' or 'G'.

In [None]:
# Removing useless columns
# NOTE(review): later cells filter `df` itself using case ids taken from
# filtered_log, so these columns are still present in the frames built from df.
USELESS_COLUMNS = ['vehicleClass', 'lifecycle:transition', 'notificationType', 'lastSent', 'matricola']
filtered_log = df.drop(columns=USELESS_COLUMNS)

In [None]:
# Delete NULL duration
# Step 1: isolate the cases whose performance lies in the [0, 0] range passed
# to pm4py.filter_case_performance.
filtered_log = pm4py.filter_case_performance(filtered_log, 0, 0)
# Step 2: keep every event of `df` that does NOT belong to one of those cases.
mask = df['case:concept:name'].isin(filtered_log['case:concept:name'])
inverse_mask = ~mask
df_filtered_null_duration = df.loc[inverse_mask]

total_case_count = df['case:concept:name'].nunique(dropna=False)
zero_duration_case_count = filtered_log['case:concept:name'].nunique(dropna=False)
print("Given {} total cases in the log we have {} cases that comply with the applied filter 'REMOVING NULL DURATION'".format(total_case_count, zero_duration_case_count))
print("Now our df have len {} filtering null duration cases".format(df_filtered_null_duration['case:concept:name'].nunique(dropna=False)))

num_events = df_filtered_null_duration.shape[0]
num_cases = df_filtered_null_duration['case:concept:name'].nunique(dropna=False)
print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))

In [None]:
# Deleting incomplete cases
# A case is kept only if its last activity is one of these.
ACCEPTED_END_ACTIVITIES = ['Payment', 'Send for Credit Collection', 'Send Appeal to Prefecture', 'Appeal to Judge']
df_filtered_incomplete_cases = pm4py.filter_end_activities(df_filtered_null_duration, ACCEPTED_END_ACTIVITIES)
print("Given {} total cases in the log we have {} cases that comply with the applied 'REMOVING INCOMPLETE CASES'".format(len(df_filtered_null_duration['case:concept:name'].unique()), len(df_filtered_incomplete_cases['case:concept:name'].unique())))
# Re-select the surviving cases from the previous frame by case id.
mask = df_filtered_null_duration['case:concept:name'].isin(df_filtered_incomplete_cases['case:concept:name'])
df_filtered_incomplete_cases = df_filtered_null_duration[mask]
# The message names this step; it previously repeated the null-duration wording.
print("Now our df have len {} after filtering incomplete cases".format(len(df_filtered_incomplete_cases['case:concept:name'].unique())))


# Cases that end with an appeal; these get an extra dismissal check below.
filtered_end_act = pm4py.filter_end_activities(df_filtered_incomplete_cases, ['Send Appeal to Prefecture', 'Appeal to Judge'])

def filter_ending_activities(row):
  """Tell whether a row's ``dismissal`` value is one of 'G' or '#'.

  Args:
    row: mapping-like object (e.g. a DataFrame row) with a 'dismissal' entry.

  Returns:
    True when the dismissal value is 'G' or '#', False for anything else.
  """
  return row['dismissal'] in ('G', '#')

# An appeal-ending case is acceptable only when the appeal was won, i.e. when
# the case carries a dismissal value of 'G' or '#'. The previous code removed
# exactly those cases and kept the unwon ones, which is the opposite.
won_appeal_rows = filtered_end_act[filtered_end_act['dismissal'].isin(['G', '#'])]
won_appeal_case_ids = won_appeal_rows['case:concept:name'].unique()

# Non-conforming rows: appeal-ending cases with no winning dismissal value.
indexes_of_not_conf = filtered_end_act[~filtered_end_act['case:concept:name'].isin(won_appeal_case_ids)]
mask = df_filtered_incomplete_cases['case:concept:name'].isin(indexes_of_not_conf['case:concept:name'])
inverse_mask = ~mask
filtered_df = df_filtered_incomplete_cases[inverse_mask]


# Report how many cases survive the dismissal-based filter.
print("Given {} total cases in the log we have {} cases that comply with the applied 'REMOVING INCOMPLETE DISMISSAL CASES'".format(len(df_filtered_incomplete_cases['case:concept:name'].unique()), len(filtered_df['case:concept:name'].unique())))
# The message names this step; it previously repeated the null-duration wording.
print("Now our df have len {} after filtering incomplete dismissal cases".format(len(filtered_df['case:concept:name'].unique())))
num_events = len(filtered_df)
num_cases = len(filtered_df['case:concept:name'].unique())
print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))

In [None]:
# Display the event log after the end-activity and dismissal filters.
filtered_df

In [None]:
# Replace every missing value, in every column, with 0.
filtered_df = filtered_df.fillna(value=0)
filtered_df

In [None]:
# Recompute start/end activity counts on the filtered log.
start_activities = pm4py.get_start_activities(filtered_df)
end_activities = pm4py.get_end_activities(filtered_df)
summary_template = "Start activities:{}\nEnd activities:{}"
print(summary_template.format(start_activities, end_activities))

# Variants Analysis

In [None]:
# Retrieve the variants of the filtered event log and tabulate them,
# most frequent first.
variants = pm4py.get_variants(filtered_df)

variants_df = (
    pd.DataFrame.from_dict(variants, orient='index', columns=['Count'])
    .reset_index()
    .rename(columns={'index': 'Variant'})
    .sort_values(by=['Count'], ascending=False)
    .reset_index(drop=True)
)

variants_df

In [None]:
# Filter out variants in which 'Send Fine' is not directly preceded by
# 'Create Fine' or 'Payment'.
variants = pm4py.get_variants(filtered_df)
VALID_SEND_FINE_PREDECESSORS = ('Create Fine', 'Payment')


def _has_misplaced_send_fine(variant):
    """Return True if any 'Send Fine' in `variant` lacks a valid direct predecessor.

    A 'Send Fine' in first position has no predecessor at all and is reported
    as misplaced. The previous index-based loop compared it against the LAST
    activity, because ``variant[-1]`` wraps around.
    """
    activities = tuple(variant)
    previous_activities = (None,) + activities[:-1]
    return any(
        current == 'Send Fine' and previous not in VALID_SEND_FINE_PREDECESSORS
        for previous, current in zip(previous_activities, activities)
    )


anomalies = {variant for variant in variants if _has_misplaced_send_fine(variant)}
# Each anomalous variant contributes its case count exactly once, even when it
# contains several misplaced 'Send Fine' events.
count = sum(variants[variant] for variant in anomalies)

print("Anomalies: ", count)

# retain=False drops the listed variants instead of keeping them.
final_filtered_df = pm4py.filter_variants(filtered_df, anomalies, activity_key='concept:name', case_id_key='case:concept:name', timestamp_key='time:timestamp', retain=False)
print("FINAL FILTERED ON VARIANTS DF HAS LEN: ",len(final_filtered_df['case:concept:name'].unique()),"cases")

In [None]:
## Let's extract the top k variants
variants_top_k = pm4py.filter_variants_top_k(final_filtered_df, 5)

# Build the table from the top-k log itself. The previous code reassigned
# variants_top_k_df from variants_df on every line, so it ended up as a copy of
# the full variants table and never used variants_top_k.
top_k_variant_counts = pm4py.get_variants(variants_top_k)
variants_top_k_df = (
    pd.DataFrame({
        'Variant': list(top_k_variant_counts.keys()),
        'Count': list(top_k_variant_counts.values()),
    })
    .sort_values(by=['Count'], ascending=False)
    .reset_index(drop=True)
)

## Case durations for the filtered log, one row per case.
case_durations_f = final_filtered_df.groupby('case:concept:name').agg(
    Events=('case:concept:name', 'count'),
    FirstOccurence=('time:timestamp', 'min'),
    LastOccurence=('time:timestamp', 'max'),
    Duration=('time:timestamp', lambda stamps: stamps.max() - stamps.min()),
)

filtered_durations = case_durations_f['Duration']
min_case_duration_f = filtered_durations.min()
max_case_duration_f = filtered_durations.max()
mean_case_duration_f = filtered_durations.mean()

# Side-by-side comparison with the statistics of the unfiltered log.
print("TOTAL event log \nMin Case Duration: {}\nMax Case Duration: {}\nMean Case Duration: {}".format(min_case_duration, max_case_duration, mean_case_duration))
print("\n\nFILTERED event log \nMin Case Duration: {}\nMax Case Duration: {}\nMean Case Duration: {}".format(min_case_duration_f, max_case_duration_f, mean_case_duration_f))

In [None]:
# Display the variants table built in the previous cell.
variants_top_k_df

# STATISTICS ABOUT VARIANTS WITH DURATION NEAR 0 (below the 25th percentile) AND NEAR THE MAXIMUM (above the 75th percentile)

In [None]:
def plot_cdf(data, percentiles, title, xlabel, ylabel, legend_label):
  """Plot the empirical CDF of `data` and mark the requested percentiles.

  Args:
    data: one-dimensional numeric values.
    percentiles: iterable of percentile ranks (0-100) to mark with a dashed
      vertical line and a value label.
    title: figure title.
    xlabel: x-axis label.
    ylabel: y-axis label.
    legend_label: legend entry for the CDF curve.
  """
  ordered = np.sort(data)
  n_points = len(ordered)
  # i-th smallest value gets cumulative probability i / n.
  cdf_values = np.arange(1, n_points + 1) / n_points

  _, ax = plt.subplots(figsize=(10, 6))
  ax.plot(ordered, cdf_values, marker='o', linestyle='-', color='b', label=legend_label)

  axis_label_style = dict(fontsize=14, weight='bold')
  ax.set_title(title, fontsize=16, weight='bold')
  ax.set_xlabel(xlabel, **axis_label_style)
  ax.set_ylabel(ylabel, **axis_label_style)
  ax.grid(True, which='both', linestyle='--', linewidth=0.5)

  value_box = dict(facecolor='white', alpha=0.6, edgecolor='red')
  for rank in percentiles:
    cut = np.percentile(ordered, rank)
    ax.axvline(cut, color='r', linestyle='--', linewidth=1, label=f"{rank}%")
    # The numeric value is written at mid-height on top of its line.
    ax.text(cut, 0.5, f"{cut:.2f}", color='r', ha='center', fontsize=12, bbox=value_box)

  # Collapse legend entries that share a label; the last handle seen wins.
  entries_by_label = {}
  for handle, text in zip(*ax.get_legend_handles_labels()):
    entries_by_label[text] = handle
  ax.legend(entries_by_label.values(), entries_by_label.keys())

  plt.show()


In [None]:
# CDF of the case durations (in days) of the unfiltered log.
durations_in_days = case_durations['Duration'].dt.total_seconds() / (24 * 60 * 60)
plot_cdf(durations_in_days, (25,50,75), title='CDF of Case Durations', xlabel='Case Duration (days)', ylabel='CDF', legend_label='Case Duration')

# Quartiles of the raw timedelta durations, then the number of cases.
for quantile, caption in ((0.25, "25th percentile: "), (0.50, "50th percentile: "), (0.75, "75th percentile: ")):
    print(caption, case_durations['Duration'].quantile(quantile))
print(len(case_durations))

In [None]:
# CDF of the case durations (in days) of the filtered log.
filtered_durations_in_days = case_durations_f['Duration'].dt.total_seconds() / (24 * 60 * 60)
plot_cdf(filtered_durations_in_days, (25,50,75), title='CDF of Case Durations', xlabel='Case Duration (days)', ylabel='CDF', legend_label='Case Duration')

# Quartiles of the raw timedelta durations, then the number of cases.
for quantile, caption in ((0.25, "25th percentile: "), (0.50, "50th percentile: "), (0.75, "75th percentile: ")):
    print(caption, case_durations_f['Duration'].quantile(quantile))
print(len(case_durations_f))

In [None]:
# Case durations of the filtered log, expressed in days.
duration = case_durations_f['Duration'].dt.total_seconds() / (24 * 60 * 60)
hist, bins = np.histogram(duration, bins='auto', density=True)
# Rescale the bin heights so that they sum to 1.
pdf = hist / hist.sum()

# Plot the PDF: one bar per bin, anchored at the bin's left edge.
left_edges = bins[:-1]
bin_widths = np.diff(bins)
plt.bar(left_edges, pdf, width=bin_widths, align='edge')
plt.xlabel('Duration')
plt.ylabel('Probability')
plt.title('Probability Distribution Function of Duration')
plt.show()

In [None]:
# 25th percentile of the filtered case durations, in days.
p25_durations_in_days = case_durations_f['Duration'].dt.total_seconds() / (24 * 60 * 60)
percentile_25 = np.percentile(p25_durations_in_days, 25)
print(f"25th percentile: {percentile_25}")

# Cases strictly faster than that percentile, and the variants they follow.
p25_cases = case_durations_f[p25_durations_in_days < percentile_25]
p25_events = df[df['case:concept:name'].isin(p25_cases.index)]
p25_variants = pm4py.get_variants(p25_events)
p25_variants

In [None]:
# 75th percentile of the filtered case durations, in days.
p75_durations_in_days = case_durations_f['Duration'].dt.total_seconds() / (24 * 60 * 60)
percentile_75 = np.percentile(p75_durations_in_days, 75)
print(f"75th percentile: {percentile_75}")

# Cases strictly slower than that percentile, and the variants they follow.
p75_cases = case_durations_f[p75_durations_in_days > percentile_75]
p75_events = df[df['case:concept:name'].isin(p75_cases.index)]
p75_variants = pm4py.get_variants(p75_events)
p75_variants

# Process Discovery

In [None]:
def evaluation(dataframe, net, im, fm):
  """Print and return four quality metrics of a Petri net against an event log.

  Args:
    dataframe: event log to replay on the model.
    net: Petri net to evaluate.
    im: initial marking of `net`.
    fm: final marking of `net`.

  Returns:
    dict with the keys 'precision', 'fitness', 'simplicity' and
    'generalization', holding the values that are printed.
  """
  # Import the evaluators explicitly: reaching them through
  # ``pm4py.algo.evaluation...`` attribute access only works if those
  # submodules happen to have been imported already.
  from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
  from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator

  precision = pm4py.precision_token_based_replay(dataframe, net, im, fm)
  print("Precision: ", precision)
  fitness = pm4py.fitness_token_based_replay(dataframe, net, im, fm)
  print("Fitness: ", fitness)

  simplicity = simplicity_evaluator.apply(net)
  print("Simplicity: ", simplicity)

  generalization = generalization_evaluator.apply(dataframe, net, im, fm)
  print("Generalization: ", generalization)

  return {
    "precision": precision,
    "fitness": fitness,
    "simplicity": simplicity,
    "generalization": generalization,
  }

In [None]:
def conformance_diagnostics(net, im, fm, dataframe=None):
  """Run token-based replay and return the per-case diagnostics as a DataFrame.

  Args:
    net: Petri net to replay against.
    im: initial marking of `net`.
    fm: final marking of `net`.
    dataframe: event log to replay. Defaults to the notebook-level
      `filtered_df`, which is what this function always used before the
      parameter existed.

  Returns:
    pandas DataFrame built from the replay diagnostics.
  """
  if dataframe is None:
    # Backward-compatible default: fall back to the global filtered log.
    dataframe = filtered_df
  tbr_diagnostics = pm4py.conformance_diagnostics_token_based_replay(dataframe, net, im, fm, return_diagnostics_dataframe=True)
  diagnostics_df = pd.DataFrame.from_dict(tbr_diagnostics)

  return diagnostics_df

In [None]:
# Footprints of the variant-filtered log, rendered as a PNG.
footprints = pm4py.discover_footprints(final_filtered_df)
pm4py.view_footprints(
    footprints,
    format='png',
)

In [None]:
# Directly-follows graph of the variant-filtered log, annotated with frequencies.
dfg = direct_follow_graph.apply(final_filtered_df)

frequency_variant = direct_follow_graph_vis.Variants.FREQUENCY
gviz = direct_follow_graph_vis.apply(dfg, log=final_filtered_df, variant=frequency_variant)
direct_follow_graph_vis.view(gviz)

In [None]:
# Alpha miner: discover a Petri net from the variant-filtered log and show it.
net, im, fm = pm4py.discover_petri_net_alpha(final_filtered_df)
pm4py.view_petri_net(net, im, fm, format='png')

# Quality metrics are computed against filtered_df (the log before the
# variant filter), as are the replay diagnostics.
evaluation(filtered_df, net, im, fm)

alpha_diagnostics = conformance_diagnostics(net, im, fm)
print(alpha_diagnostics)

In [None]:
# Inductive miner: discover a Petri net from the variant-filtered log and show it.
net, im, fm = pm4py.discover_petri_net_inductive(final_filtered_df)
pm4py.view_petri_net(net, im, fm, format='png')

# Quality metrics are computed against filtered_df (the log before the
# variant filter), as are the replay diagnostics.
evaluation(filtered_df, net, im, fm)

inductive_diagnostics = conformance_diagnostics(net, im, fm)
print(inductive_diagnostics)

In [None]:
# Heuristics miner: discover a Petri net from the variant-filtered log and show it.
heuristics_column_keys = dict(
    activity_key='concept:name',
    case_id_key='case:concept:name',
    timestamp_key='time:timestamp',
)
net, im, fm = pm4py.discover_petri_net_heuristics(final_filtered_df, **heuristics_column_keys)

pm4py.view_petri_net(net, im, fm, format='png')

evaluation(filtered_df, net, im, fm)

# Kept (not printed): the conformance-checking section reads diagnostics_df.
diagnostics_df = conformance_diagnostics(net, im, fm)

In [None]:
# ILP miner: discover a Petri net from the variant-filtered log and show it.
ilp_column_keys = dict(
    activity_key='concept:name',
    case_id_key='case:concept:name',
    timestamp_key='time:timestamp',
)
net, im, fm = pm4py.discover_petri_net_ilp(final_filtered_df, **ilp_column_keys)

pm4py.view_petri_net(net, im, fm, format='png')

evaluation(filtered_df, net, im, fm)

ilp_diagnostics = conformance_diagnostics(net, im, fm)
print(ilp_diagnostics)

# Conformance Checking

In [None]:
## Case ids whose token-replay trace fitness is at or below the threshold.
LOW_FITNESS_THRESHOLD = 0.69
is_low_fitness = diagnostics_df['trace_fitness'] <= LOW_FITNESS_THRESHOLD
disfuntional_traces = diagnostics_df.loc[is_low_fitness, 'case_id'].tolist()
print(len(disfuntional_traces))
print(disfuntional_traces)

## Drop those cases from the variant-filtered log (retain=False removes matches).
df_cleaned_on_low_fitness = pm4py.filter_event_attribute_values(
    final_filtered_df,
    'case:concept:name',
    disfuntional_traces,
    retain=False,
)

num_events = df_cleaned_on_low_fitness.shape[0]
num_cases = df_cleaned_on_low_fitness['case:concept:name'].nunique(dropna=False)
print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))

In [None]:
# Conformance checking diagnostics using alignments.
# NOTE(review): on a top-to-bottom run, net/im/fm hold the model from the ILP
# cell (the last discovery cell), while diagnostics_df was produced with the
# heuristics model. Confirm which model is meant to be aligned here.
aligments = pm4py.conformance_diagnostics_alignments(
    df_cleaned_on_low_fitness,
    net,
    im,
    fm,
)

In [None]:
deviations = []

# Extract deviations. Each alignment step is a (log side, model side) pair.
# A step is a deviation when the two sides differ and neither side is None.
for trace in aligments:
    for step in trace['alignment']:
        log_side, model_side = step[0], step[1]
        if log_side != model_side and log_side is not None and model_side is not None:
            # Stored as a tuple so that steps can be counted with Counter.
            deviations.append(tuple(step))

# Deviation statistics.
# '>>' marks the side that did not move: a log move has '>>' on the model side
# (step[1]), a model move has '>>' on the log side (step[0]). The two counters
# were previously swapped.
num_deviations = len(deviations)
num_log_moves = sum(1 for step in deviations if step[1] == '>>')
num_model_moves = sum(1 for step in deviations if step[0] == '>>')

print(f'Total deviations: {num_deviations}')
print(f'Log moves (events in log not in model): {num_log_moves}')
print(f'Model moves (events in model not in log): {num_model_moves}')

if deviations:
    print("\nTop 5 Deviations:")
    # Rank distinct deviations by frequency; the previous code printed the first
    # five list entries, which could repeat and were not the most common ones.
    for rank, (deviation, occurrences) in enumerate(Counter(deviations).most_common(5), start=1):
        print(f"{rank}. {deviation} (Count: {occurrences})")
else:
    print("\nNo deviations found.")

In [None]:
# Display the log that remains after removing the low-fitness cases.
df_cleaned_on_low_fitness

# Machine Learning


In [None]:
# Compute each case's duration (last timestamp minus first) ...
new_df = (
    df_cleaned_on_low_fitness
    .groupby('case:concept:name')['time:timestamp']
    .agg(lambda stamps: stamps.max() - stamps.min())
    .rename('Duration')
    .reset_index()
)

# ... and attach it to every event of that case.
df_with_duration = df_cleaned_on_low_fitness.merge(new_df, on='case:concept:name', how='left')

In [None]:
# Display the event log with the per-case Duration column attached.
df_with_duration

In [None]:
# Keep only the 'Create Fine' events and the columns used by the model.
MODEL_COLUMNS = ['amount', 'points', 'article', 'org:resource', 'vehicleClass', 'Duration','case:concept:name']
is_create_fine = df_with_duration['concept:name'] == 'Create Fine'
df_with_duration = df_with_duration.loc[is_create_fine, MODEL_COLUMNS]

In [None]:
# Whole days of each case's duration.
duration_days = df_with_duration['Duration'].dt.days
bins = [0, 5, 90, np.inf]
labels = ['short', 'medium', 'long']
# include_lowest=True closes the first interval on the left, so a duration of
# 0 whole days is labelled 'short' instead of becoming NaN. assign() builds a
# new frame rather than writing columns into a slice of another DataFrame.
df_with_duration = df_with_duration.assign(
    duration_category=pd.cut(duration_days, bins=bins, labels=labels, include_lowest=True)
).drop(columns=['Duration'])
df_with_duration

In [None]:
# Class balance of the target before resampling.
duration_labels = df_with_duration['duration_category']
category_counts = duration_labels.value_counts()
print(category_counts)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# NOTE(review): df_merged is not read by any later cell visible in this notebook.
df_merged = df_cleaned_on_low_fitness[df_cleaned_on_low_fitness['amount'].ne(0)]

# Features: every column except the target and the case identifier.
X = df_with_duration.drop(columns=["duration_category", "case:concept:name"])
# Target: the duration class.
y = df_with_duration["duration_category"]

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import RandomOverSampler

# NOTE(review): the oversampling is applied before the train/test split done
# further down, so resampled rows can land in both the training and test sets.
ros = RandomOverSampler(random_state=42)
resampled = ros.fit_resample(X, y)
X_resampled, y_resampled = resampled

In [None]:
# Class balance of the target after oversampling.
category_counts = y_resampled.value_counts()
print(
    category_counts
)

In [None]:
# Encode the 'vehicleClass' feature with its own encoder, so that its fitted
# classes are not lost when the target is encoded. The encoded column replaces
# the original one; assign/drop build a new frame instead of mutating in place.
vehicle_class_encoder = LabelEncoder()
X_resampled = X_resampled.assign(
    vehicle_class_econded=vehicle_class_encoder.fit_transform(X_resampled['vehicleClass'])
).drop(columns='vehicleClass')

# `le` is fitted on the target only, so it can map predictions back to labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y_resampled)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# done in the next cell, so test rows influence the fitted scaling.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_resampled)
X_scaled = min_max_scaler.transform(X_resampled)

In [None]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Baseline random forest: 100 trees, fixed seed.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Score the held-out set.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Class-weighted averages of the per-class scores.
for caption, scorer in (("Precision:", precision_score), ("Recall:", recall_score), ("F1 Score:", f1_score)):
    print(caption, scorer(y_test, y_pred, average='weighted'))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

In [None]:
# Hyper-parameter grid: 3 * 3 * 3 * 3 * 3 = 243 combinations, each fitted on
# 5 folds (1215 fits in total).
param_grid = {
    'n_estimators': [100, 200, 300],
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2', None]
}
# Seed the estimator, matching the baseline forest, so the search is repeatable.
clf = RandomForestClassifier(random_state=42)
# n_jobs=-1 spreads the fits over all available cores; results are unchanged.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

XGBoost

In [None]:
# Install xgboost into the kernel's environment (no version pin), then import it.
%pip install xgboost
import xgboost as xgb

In [None]:
# NOTE(review): the target is built from three duration labels, yet the
# objective is 'binary:logistic'. Confirm how the installed xgboost version
# handles this.
xgb_settings = dict(objective='binary:logistic', n_estimators=100, learning_rate=0.01)
xgb_model = xgb.XGBClassifier(**xgb_settings)
xgb_model.fit(X_train, y_train)

# Score the held-out set.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Class-weighted averages of the per-class scores.
for caption, scorer in (("Precision:", precision_score), ("Recall:", recall_score), ("F1 Score:", f1_score)):
    print(caption, scorer(y_test, y_pred, average='weighted'))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

LogisticRegression


In [None]:
from sklearn.linear_model import LogisticRegression

# Plain logistic regression with default settings (despite the variable name,
# no grid search is run here).
grid_search_lr = LogisticRegression()
grid_search_lr.fit(X_train, y_train)

# Score the held-out set.
y_pred = grid_search_lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Class-weighted averages of the per-class scores.
for caption, scorer in (("Precision:", precision_score), ("Recall:", recall_score), ("F1 Score:", f1_score)):
    print(caption, scorer(y_test, y_pred, average='weighted'))

SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings (despite the variable name,
# no grid search is run here).
grid_search_svc = SVC()
grid_search_svc.fit(X_train, y_train)

# Score the held-out set.
y_pred = grid_search_svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Class-weighted averages of the per-class scores.
for caption, scorer in (("Precision:", precision_score), ("Recall:", recall_score), ("F1 Score:", f1_score)):
    print(caption, scorer(y_test, y_pred, average='weighted'))