In [43]:
# Exercise 1

import pandas as pd

data = pd.read_csv('garments_worker_productivity.csv')

# Drop the date column
if 'date' in data.columns:
    data.drop(columns=['date'], inplace=True)

# For each of the categorical attributes, print all the unique elements.
categorical_columns = ['day', 'quarter', 'department', 'team']
unique_values = {col: data[col].unique() for col in categorical_columns}
print(unique_values)
# From the unique values output, we can see that the there is a duplicated values in department column, which is 'finishing'
# since one of them has extra space after the word.
# We need to clean the data

# Data cleaning for categorical attributes
data['day'] = data['day'].str.strip().str.lower()
data['quarter'] = data['quarter'].str.strip().str.lower()
data['department'] = data['department'].str.strip().str.lower()

unique_values = {col: data[col].unique() for col in categorical_columns}

# Create another column named satisfied that records the productivity performance.
data['satisfied'] = (data['actual_productivity'] >= data['targeted_productivity']).astype(int)

# Drop the columns actual_productivity and targeted_productivity
data.drop(columns=['actual_productivity', 'targeted_productivity'], inplace=True)

# Find and print which columns/attributes have empty values
missing_columns = data.columns[data.isnull().any()]
print(missing_columns) # wip has empty value

# Fill the empty values with 0
data.fillna(0, inplace=True)

{'day': array(['Thursday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype=object), 'quarter': array(['Quarter1', 'Quarter2', 'Quarter3', 'Quarter4', 'Quarter5'],
      dtype=object), 'department': array(['sweing', 'finishing ', 'finishing'], dtype=object), 'team': array([ 8,  1, 11, 12,  6,  7,  2,  3,  9, 10,  5,  4])}
Index(['wip'], dtype='object')


In [42]:
# Exercise 2

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report

# Exercise 2.1
# For each of the categorical attributes, encode the set of categories using integers 0, ... n - 1
label_encoders = {}
for col in ['day', 'quarter', 'department', 'team']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Split the data into training and testing sets with a ratio of 80:20
X = data.drop(columns=['satisfied'])
y = data['satisfied']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Exercise 2.2
# Use the categorical attributes only
categorical_features = ['day', 'quarter', 'department', 'team']

# please build a Categorical Naïve Bayes classifier that predicts the column satisfied. 
X_train_cat = X_train[categorical_features]
X_test_cat = X_test[categorical_features]

nb_model = CategoricalNB()
nb_model.fit(X_train_cat, y_train)

y_pred = nb_model.predict(X_test_cat)

# Report the testing result using classification_report
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df)

              precision    recall  f1-score  support
0              0.434783  0.158730  0.232558   63.000
1              0.755760  0.926554  0.832487  177.000
accuracy       0.725000  0.725000  0.725000    0.725
macro avg      0.595271  0.542642  0.532523  240.000
weighted avg   0.671504  0.725000  0.675006  240.000


In [53]:
# Exercise 3

from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import pandas as pd

# Exercise 3.1
# For each of the categorical attributes, encode them with one-hot encoding.
encoder = OneHotEncoder(sparse=False, drop='first')
X_categorical_encoded = encoder.fit_transform(data[['day', 'quarter', 'department', 'team']])

encoded_feature_names = encoder.get_feature_names_out(['day', 'quarter', 'department', 'team'])
X_categorical_df = pd.DataFrame(X_categorical_encoded)

X_numerical = data.drop(columns=['day', 'quarter', 'department', 'team', 'satisfied'])
X_encoded = pd.concat([X_categorical_df, X_numerical.reset_index(drop=True)], axis=1)

# Split the data into training and testing sets with a ratio of 80:20.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


# Exercise 3.2
# Using all the attributes we have, please build a SVM that predicts the column satisfied.
X_train_svm.columns = X_train_svm.columns.astype(str)
X_test_svm.columns = X_test_svm.columns.astype(str)

scaler = StandardScaler()
X_train_svm_scaled = scaler.fit_transform(X_train_svm)
X_test_svm_scaled = scaler.fit_transform(X_test_svm)

# Build one SVM with a linear kernel.
svm_linear = SVC(kernel='linear', random_state=42)
svm_linear.fit(X_train_svm_scaled, y_train_svm)
y_pred_linear = svm_linear.predict(X_test_svm_scaled)

# Build another SVM with an RBF kernel.
svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train_svm_scaled, y_train_svm)
y_pred_rbf = svm_rbf.predict(X_test_svm_scaled)

# Report the testing results of both models using classification_report.
report_linear = classification_report(y_test_svm, y_pred_linear, output_dict=True)
report_rbf = classification_report(y_test_svm, y_pred_rbf, output_dict=True)

report_linear_df = pd.DataFrame(report_linear).transpose()
report_rbf_df = pd.DataFrame(report_rbf).transpose()

print(report_linear_df)

print(report_rbf_df)


# Exercise 3.3

# For the column satisfied in the training set, print the frequency of each class.
class_counts_before = y_train_svm.value_counts()
print(class_counts_before)

# Oversample the training data.
smote = SMOTE(random_state=42)
X_train_svm_resampled, y_train_svm_resampled = smote.fit_resample(X_train_svm_scaled, y_train_svm)

# For the column satisfied in the oversampled data, print the frequency of each class again.
class_counts_after = pd.Series(y_train_svm_resampled).value_counts()
print(class_counts_after)

# Re-build the two SVMs using the same settings as in Exercise 3.2, but use the oversampled training data instead.
# Since we used scaled data, we do not need to scale again
svm_linear_resampled = SVC(kernel='linear', random_state=42)
svm_linear_resampled.fit(X_train_svm_resampled, y_train_svm_resampled)
y_pred_linear_resampled = svm_linear_resampled.predict(X_test_svm_scaled)

svm_rbf_resampled = SVC(kernel='rbf', random_state=42)
svm_rbf_resampled.fit(X_train_svm_resampled, y_train_svm_resampled)
y_pred_rbf_resampled = svm_rbf_resampled.predict(X_test_svm_scaled)

# Report the testing results using classification_report.
report_linear_resampled = classification_report(y_test_svm, y_pred_linear_resampled, output_dict=True)
report_rbf_resampled = classification_report(y_test_svm, y_pred_rbf_resampled, output_dict=True)

report_linear_resampled_df = pd.DataFrame(report_linear_resampled).transpose()
report_rbf_resampled_df = pd.DataFrame(report_rbf_resampled).transpose()

print(report_linear_resampled_df)
print(report_rbf_resampled_df)




              precision    recall  f1-score     support
0              0.689655  0.317460  0.434783   63.000000
1              0.796209  0.949153  0.865979  177.000000
accuracy       0.783333  0.783333  0.783333    0.783333
macro avg      0.742932  0.633306  0.650381  240.000000
weighted avg   0.768238  0.783333  0.752790  240.000000
              precision    recall  f1-score   support
0              0.600000  0.285714  0.387097   63.0000
1              0.785714  0.932203  0.852713  177.0000
accuracy       0.762500  0.762500  0.762500    0.7625
macro avg      0.692857  0.608959  0.619905  240.0000
weighted avg   0.736964  0.762500  0.730489  240.0000
satisfied
1    698
0    259
Name: count, dtype: int64
satisfied
1    698
0    698
Name: count, dtype: int64
              precision    recall  f1-score     support
0              0.431373  0.698413  0.533333   63.000000
1              0.862319  0.672316  0.755556  177.000000
accuracy       0.679167  0.679167  0.679167    0.679167
macro av