This part was executed on Google Colab for faster computation.

In [None]:
import itertools

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder

# Load the 'Export 41 and weather' CSV into the frame used by the rest of the script
df_41 = pd.read_csv(
    '/content/drive/MyDrive/KUL 2022-2024/SS2022/Modern Data Analytics/Noise and Weather/Export 41 and weather.csv'
)

# Coarse groups, each listing the detected classes that are folded into it
_detected_classes_by_group = {
    'Car': ['Transport road - Passenger car', 'Transport road - Siren'],
    'Human': ['Human voice - Shouting', 'Human voice - Singing', 'Music non-amplified'],
    'Other': ['Nature elements - Wind', 'Unsupported'],
}

# Invert the grouping into a flat "detected class -> group" lookup
sub_type_mapping = {
    detected_class: group
    for group, detected_classes in _detected_classes_by_group.items()
    for detected_class in detected_classes
}

# Label every row with its group; classes absent from the lookup map to NaN
df_41['Modified Sub-Type'] = df_41['noise_event_laeq_primary_detected_class'].map(sub_type_mapping)

def model_tuning(months, seeds, test_sizes, n_estimators_list, random_states, cv_list):
    """Evaluate a SMOTE + random-forest classifier over a grid of settings.

    Every combination of the six argument lists is run against the
    module-level ``df_41`` frame, with ``'Modified Sub-Type'`` as the target.
    For each combination the forest is trained on the SMOTE-oversampled
    training split, scored on the untouched test split, and cross-validated
    on the training split.

    Args:
        months: Month cutoffs; only rows with ``month <= cutoff`` are used.
        seeds: Random states used for the train/test split and for SMOTE.
        test_sizes: ``test_size`` values passed to ``train_test_split``.
        n_estimators_list: ``n_estimators`` values for the forest.
        random_states: ``random_state`` values for the forest.
        cv_list: ``cv`` values passed to ``cross_val_score``.

    Returns:
        pandas.DataFrame with one row per combination and the columns
        ``month``, ``seed``, ``test_size``, ``n_estimators``,
        ``random_state``, ``cv``, ``Cross-Validation Accuracy`` and
        ``Test Accuracy``, in the same order the nested loops would visit.
    """
    feature_columns = [
        'description', 'hour', 'month', 'LC_RAD60', 'LC_WINDDIR', 'LC_DWPTEMP',
        'LC_HUMIDITY', 'LC_WINDSPEED', 'weekday', 'LC_TEMP_QCL3',
    ]
    results = []

    for month in months:
        # Filtering and encoding depend only on the month cutoff, so do them
        # once per month instead of once per grid point.
        df_period = df_41.loc[df_41['month'] <= month]

        # Work on an explicit copy so the encoded column is not written into
        # a slice of df_41 (avoids pandas' SettingWithCopyWarning).
        x = df_period[feature_columns].copy()
        x['description'] = LabelEncoder().fit_transform(x['description'])

        # NOTE(review): classes missing from sub_type_mapping are NaN here
        # and are passed to the encoder as-is -- confirm this is intended.
        y_encoded = LabelEncoder().fit_transform(df_period['Modified Sub-Type'])

        for seed, test_size in itertools.product(seeds, test_sizes):
            # The seed drives the split and the oversampling; it was
            # previously recorded in the results but never applied.
            x_train, x_test, y_train, y_test = train_test_split(
                x, y_encoded, test_size=test_size, random_state=seed
            )

            # Oversample the minority classes in the training data only
            x_train_resampled, y_train_resampled = SMOTE(random_state=seed).fit_resample(x_train, y_train)

            for n_estimators, random_state in itertools.product(n_estimators_list, random_states):
                # The fitted model and its test score do not depend on cv,
                # so compute them once per forest configuration.
                model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
                model.fit(x_train_resampled, y_train_resampled)
                accuracy_test = accuracy_score(y_test, model.predict(x_test))

                for cv in cv_list:
                    # Cross-validate on the un-resampled training split with
                    # SMOTE inside the pipeline, so synthetic samples are
                    # generated from each fold's training part only and never
                    # leak into the fold being scored.
                    cv_pipeline = Pipeline([
                        ('smote', SMOTE(random_state=seed)),
                        ('forest', RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)),
                    ])
                    scores = cross_val_score(cv_pipeline, x_train, y_train, cv=cv, scoring='accuracy')

                    results.append({
                        'month': month,
                        'seed': seed,
                        'test_size': test_size,
                        'n_estimators': n_estimators,
                        'random_state': random_state,
                        'cv': cv,
                        'Cross-Validation Accuracy': scores.mean(),
                        'Test Accuracy': accuracy_test,
                    })
                    print("Importances:", model.feature_importances_)

    # Create a dataframe from the results list
    return pd.DataFrame(results)

# Candidate values per tuning dimension; one run is made for every combination
months = [3, 6, 12]
seeds = [20]  # alternatives noted earlier: 71, 420
test_sizes = [0.2]  # alternative noted earlier: 0.45
n_estimators_list = [50, 200]  # alternative noted earlier: 120
random_states = [42]
cv_list = [15]  # alternatives noted earlier: 20, 40

# Run the tuning sweep over the full grid
results_df = model_tuning(
    months=months,
    seeds=seeds,
    test_sizes=test_sizes,
    n_estimators_list=n_estimators_list,
    random_states=random_states,
    cv_list=cv_list,
)

# Show the collected scores and save them to CSV without the index column
print(results_df)
results_df.to_csv('result230604_correct.csv', index=False)