In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Step 1: Loading the dataset
data = pd.read_excel("AnomaData.xlsx")

In [3]:
# Step 2: Exploratory Data Analysis (EDA)
# Check data quality
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18398 entries, 0 to 18397
Data columns (total 62 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   time    18398 non-null  datetime64[ns]
 1   y       18398 non-null  int64         
 2   x1      18398 non-null  float64       
 3   x2      18398 non-null  float64       
 4   x3      18398 non-null  float64       
 5   x4      18398 non-null  float64       
 6   x5      18398 non-null  float64       
 7   x6      18398 non-null  float64       
 8   x7      18398 non-null  float64       
 9   x8      18398 non-null  float64       
 10  x9      18398 non-null  float64       
 11  x10     18398 non-null  float64       
 12  x11     18398 non-null  float64       
 13  x12     18398 non-null  float64       
 14  x13     18398 non-null  float64       
 15  x14     18398 non-null  float64       
 16  x15     18398 non-null  float64       
 17  x16     18398 non-null  float64       
 18  x17   

In [4]:
# Treat missing values if any
data.dropna(inplace=True)

In [5]:
# Step 3: Correcting date datatype
data['time'] = pd.to_datetime(data['time'])

In [6]:
# Step 4: Feature Engineering and Selection
# Assuming no specific feature engineering required

In [7]:
# Step 5: Train/Test Split
X = data.drop(columns=['y', 'y.1'])
y = data['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Convert datetime to numerical representation
# Convert 'time' column to datetime if it's not already
X_train['time'] = pd.to_datetime(X_train['time'])
X_test['time'] = pd.to_datetime(X_test['time'])

# Conver t datetime to Unix timestamp in seconds
X_train['time'] = X_train['time'].astype('int64') // 10**9
X_test['time'] = X_test['time'].astype('int64') // 10**9


In [9]:
# Step 6: Model Selection
model = RandomForestClassifier()

In [10]:
# Step 7: Model Training
model.fit(X_train, y_train)

In [19]:
# Step 8: Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.997554347826087
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3658
           1       0.93      0.64      0.76        22

    accuracy                           1.00      3680
   macro avg       0.97      0.82      0.88      3680
weighted avg       1.00      1.00      1.00      3680



In [12]:
# Step 9: Hyperparameter Tuning/Model Improvement
param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [None, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}


In [13]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)



In [15]:
grid_search.fit(X_train, y_train)


In [16]:
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
