In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Load feature names
features = pd.read_csv('UCI HAR Dataset/features.txt', delim_whitespace=True, header=None, names=['index', 'feature_name'])

# Deduplicate feature names by appending a unique index to duplicates
feature_names = pd.Series(features['feature_name']).apply(lambda x: x.strip()).tolist()
unique_feature_names = pd.Index([f"{name}_{i}" if feature_names.count(name) > 1 else name for i, name in enumerate(feature_names)])

# Load activity labels
activity_labels = pd.read_csv('UCI HAR Dataset/activity_labels.txt', delim_whitespace=True, header=None, names=['index', 'activity_name'])
activity_labels_dict = dict(zip(activity_labels['index'], activity_labels['activity_name']))

# Load training data
X_train = pd.read_csv('UCI HAR Dataset/train/X_train.txt', delim_whitespace=True, header=None, names=unique_feature_names)
y_train = pd.read_csv('UCI HAR Dataset/train/y_train.txt', header=None, names=['Activity'])
subject_train = pd.read_csv('UCI HAR Dataset/train/subject_train.txt', header=None, names=['Subject'])

# Load test data
X_test = pd.read_csv('UCI HAR Dataset/test/X_test.txt', delim_whitespace=True, header=None, names=unique_feature_names)
y_test = pd.read_csv('UCI HAR Dataset/test/y_test.txt', header=None, names=['Activity'])
subject_test = pd.read_csv('UCI HAR Dataset/test/subject_test.txt', header=None, names=['Subject'])

# Mapping activity labels to their names
y_train['Activity'] = y_train['Activity'].map(activity_labels_dict)
y_test['Activity'] = y_test['Activity'].map(activity_labels_dict)

# Display the first few rows to verify
X_train.head()

# Combine subject, activity, and features into a single DataFrame
train_data = pd.concat([subject_train, y_train, X_train], axis=1)
test_data = pd.concat([subject_test, y_test, X_test], axis=1)

# Merge train and test data
data = pd.concat([train_data, test_data])

# Display the first few rows
data.head()




  features = pd.read_csv('UCI HAR Dataset/features.txt', delim_whitespace=True, header=None, names=['index', 'feature_name'])
  activity_labels = pd.read_csv('UCI HAR Dataset/activity_labels.txt', delim_whitespace=True, header=None, names=['index', 'activity_name'])
  X_train = pd.read_csv('UCI HAR Dataset/train/X_train.txt', delim_whitespace=True, header=None, names=unique_feature_names)
  X_test = pd.read_csv('UCI HAR Dataset/test/X_test.txt', delim_whitespace=True, header=None, names=unique_feature_names)


Unnamed: 0,Subject,Activity,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,1,STANDING,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,1,STANDING,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,1,STANDING,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,1,STANDING,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,1,STANDING,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [3]:


# Separate features and target labels
X = data.drop(['Subject', 'Activity'], axis=1)
y = data['Activity']

# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)


In [4]:
# Step 10: Initialize and train the Random Forest model
rf = RandomForestClassifier(random_state=12)

# Train the model on the training data
rf.fit(X_train, y_train)

# Step 11: Predict on the test data
y_pred = rf.predict(X_test)

# Step 12: Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Step 13: Classification report for more detailed performance metrics
print("Classification Report (Baseline Model):")
print(classification_report(y_test, y_pred))

Accuracy: 0.9786
Classification Report (Baseline Model):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       389
           1       0.97      0.96      0.96       356
           2       0.97      0.97      0.97       381
           3       1.00      0.98      0.99       344
           4       0.98      0.96      0.97       281
           5       0.96      0.99      0.98       309

    accuracy                           0.98      2060
   macro avg       0.98      0.98      0.98      2060
weighted avg       0.98      0.98      0.98      2060



In [11]:

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [ 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# Initialize the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=1, n_jobs=-1)

# Fit the model to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator after tuning
best_rf = grid_search.best_estimator_

# Predict using the tuned model
y_pred_best = best_rf.predict(X_test)

# Evaluate the tuned model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Accuracy after tuning: {accuracy_best:.4f}")

# Print classification report
print("Classification Report (Tuned Model):")
print(classification_report(y_test, y_pred_best))

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Accuracy after tuning: 0.9791
Classification Report (Tuned Model):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       389
           1       0.97      0.97      0.97       356
           2       0.97      0.97      0.97       381
           3       0.99      0.98      0.99       344
           4       0.99      0.96      0.97       281
           5       0.95      0.99      0.97       309

    accuracy                           0.98      2060
   macro avg       0.98      0.98      0.98      2060
weighted avg       0.98      0.98      0.98      2060

