In [None]:
# @title Loading the  data into Drive from Kaggle

# Install Kaggle library
!pip install kaggle

# Install colab and mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Move Kaggle API key to the appropriate directory
!mkdir -p ~/.kaggle
!cp /content/kaggle.json  ~/.kaggle/

# Download the dataset
!kaggle competitions download -c learning-of-structured-data-fhws-ws2324

# Unzip the dataset
!unzip -q learning-of-structured-data-fhws-ws2324.zip

In [None]:
# @title Imported Necessary Libraries
import numpy as np
import pandas as pd
import glob
import os
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# @title Function to extract Labels and DataFrame
def load_dataset(folder_path, rows_per_file=194):
    """
    Load every CSV file of a folder into one labelled dataframe.

    Each file is read without a header into 79 columns named F1..F79. The
    label of a file is the part of its file name after the last underscore
    and before the first following dot; it is stored in a 'label' column.

    :param folder_path: The path to the folder containing the CSV files.
    :param rows_per_file: Number of leading rows kept from each file. The
        default (194) matches the value this function has always used. Pass
        None to keep as many rows as the shortest file has. Files shorter
        than rows_per_file are kept as they are (no padding).
    :return: A tuple (dataset, names): the concatenated dataframe (each file
        keeps its own 0-based row index) and the list of per-file labels, in
        the same order as the files appear in the dataframe.
    :raises ValueError: If the folder contains no CSV files.
    """
    # glob returns files in arbitrary order; sort so that the row order (and
    # everything derived from it, e.g. a seeded train/test split) is
    # reproducible between runs.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not csv_files:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    # Define column names
    columns_names = ["F" + str(i) for i in range(1, 80)]

    dataframes = []
    names = []

    # Read every file exactly once
    for csv_file in tqdm(csv_files, desc="Loading Data", unit="file"):
        temp_df = pd.read_csv(csv_file, na_values='', header=None, names=columns_names)

        # Parse the label from the file name only, so that underscores or
        # dots in the directory part of the path cannot leak into it.
        label = os.path.basename(csv_file).split('_')[-1].split('.')[0]
        temp_df['label'] = label

        dataframes.append(temp_df)
        names.append(label)

    if rows_per_file is None:
        # Truncate every file to the length of the shortest one
        rows_per_file = min(len(frame) for frame in dataframes)

    # Keep the leading rows of each file and stack them into one dataframe
    dataset = pd.concat([frame.iloc[:rows_per_file] for frame in dataframes])

    # Return the dataset and the labels
    return dataset, names

In [None]:
# @title Load the training dataset
# data_df holds the stacked rows of all files, actions holds one label per file
folder_path = '/content/train/train'
data_df, actions = load_dataset(folder_path=folder_path)

In [None]:
# Number of loaded files (one label per file)
np.shape(actions)

In [None]:
# @title Handle missing values by filling them with the column mean
# numeric_only=True keeps the string 'label' column out of the mean; pandas 2.x
# no longer drops non-numeric columns silently and raises a TypeError instead.
df = data_df.fillna(data_df.mean(numeric_only=True))

In [None]:
# Rows per label after loading
df.loc[:, 'label'].value_counts()

In [None]:
# @title Reshape the data to (samples, timesteps, features)
# Drop the trailing 'label' column so only the feature columns remain
df1 = df.iloc[:,:-1]
# One sample per loaded file; derive the dimensions from the data instead of
# hard-coding them so the cell keeps working when the dataset changes.
assert len(df1) % len(actions) == 0, "files contribute unequal numbers of rows"
df2 = df1.values.reshape(len(actions), -1, df1.shape[1])

In [None]:
# @title Flatten each sample into one feature vector for XGBoost
# Collapse every axis after the first into a single feature axis
X_data_reshaped = df2.reshape((len(df2), -1))
# One label per sample, in file order
y_data = actions

In [None]:
# @title Convert string labels to numeric labels with LabelEncoder
# Fit the encoder first, then encode the same labels with it
label_encoder = LabelEncoder().fit(y_data)
y_data_encoded = label_encoder.transform(y_data)

In [None]:
# @title Inspect the distinct encoded class ids
unique_classes_encoded = np.unique(ar=y_data_encoded)
print("Unique Classes (Encoded):", unique_classes_encoded)

In [None]:
# @title Train-test split
# Hold out 20% of the samples; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_data_reshaped,
    y_data_encoded,
    test_size=0.2,
    random_state=42,
)

In [None]:
# @title Create an XGBoost classifier
# Take the class count from the fitted encoder instead of hard-coding it, so
# the model stays consistent with the labels that were actually loaded.
model = XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_))

In [None]:
# @title Fit the classifier on the training split
model.fit(X=X_train, y=y_train)

In [None]:
# @title Predict the held-out split
y_pred = model.predict(X=X_test)

In [None]:
# @title Map the numeric hold-out predictions back to their string labels
y_pred_decoded = label_encoder.inverse_transform(y=y_pred)

In [None]:
# @title Score the hold-out predictions
# Both arguments are encoded class ids, so they are compared directly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy}")

In [None]:
# @title Load the test dataset
# Same loader as for training; names1 holds what it parsed from each file name
test_folder_path = '/content/test/test'
test_df, names1 = load_dataset(folder_path=test_folder_path)

In [None]:
# Display the loaded test frame for a quick visual check
test_df

In [None]:
# @title Handle missing values by filling them with the column mean
# numeric_only=True keeps the string 'label' column out of the mean; pandas 2.x
# no longer drops non-numeric columns silently and raises a TypeError instead.
df4 = test_df.fillna(test_df.mean(numeric_only=True))

In [None]:
# @title Remove the label column and reshape to (samples, timesteps, features)
# Drop the trailing 'label' column so only the feature columns remain
df5 = df4.iloc[:,:-1]
# One sample per test file. The per-sample shape is taken from the training
# tensor (df2) so the flattened test features line up with what the model was
# trained on; reshape raises if the test data does not fit that shape.
X_new = df5.values.reshape(len(names1), df2.shape[1], df2.shape[2])

In [None]:
# Shape of the reshaped test tensor
np.shape(X_new)

In [None]:
# @title Flatten each test sample into one feature vector
X_new_reshaped = X_new.reshape((len(X_new), -1))

In [None]:
# @title Predict the class ids of the test samples
new_predictions = model.predict(X=X_new_reshaped)

In [None]:
# @title Map the numeric test predictions back to their string labels
new_predictions_decoded = label_encoder.inverse_transform(y=new_predictions)

In [None]:
# @title Show the decoded predictions
# Header and array are printed on separate lines
print("Predicted Classes for df6:", new_predictions_decoded, sep="\n")

In [None]:
# Container type of the decoded predictions
new_predictions_decoded.__class__

In [None]:
# @title Map the predicted labels to their numeric class ids

mapping = {"boxing": 0, "drums": 1, "guitar": 2, "rowing": 3, "violin": 4}

# Create a DataFrame with the predicted classes. It gets its own name so the
# cleaned training frame `df` that earlier cells read is not overwritten.
predicted_classes_df = pd.DataFrame({'Predicted Classes': new_predictions_decoded})

# Look every label up in the mapping; a label missing from the mapping shows
# up as NaN instead of silently staying a string.
predicted_classes_df['Predicted Classes'] = predicted_classes_df['Predicted Classes'].map(mapping)


In [None]:
# @title Check the number of decoded predictions
np.shape(new_predictions_decoded)

In [None]:
# @title Build the submission table
# ids count up from 1 in prediction order; 'action' holds the encoded class ids
submission_df = pd.DataFrame(
    data={
        'id': range(1, 1 + len(new_predictions)),
        'action': new_predictions,
    }
)

In [None]:
# Display the submission table before it is written to disk
submission_df

In [None]:
# @title Write the submission table to CSV
# index=False keeps the dataframe index out of the file
submission_df.to_csv(path_or_buf='Submission_file.csv', index=False)