In [1]:
import pandas as pd

# Load the data from the provided Excel file
file_path = 'Copy of AnomaData.xlsx'
data = pd.read_excel(file_path)

# Display the dataframe 
data.head()


Unnamed: 0,time,y,x1,x2,x3,x4,x5,x6,x7,x8,...,x51,x52,x54,x55,x56,x57,x58,x59,x60,y.1
0,1999-05-01 00:00:00,0,0.376665,-4.596435,-4.095756,13.497687,-0.11883,-20.669883,0.000732,-0.061114,...,29.984624,10.091721,-4.936434,-24.590146,18.515436,3.4734,0.033444,0.953219,0.006076,0
1,1999-05-01 00:02:00,0,0.47572,-4.542502,-4.018359,16.230659,-0.128733,-18.758079,0.000732,-0.061114,...,29.984624,10.095871,-4.937179,-32.413266,22.760065,2.682933,0.033536,1.090502,0.006083,0
2,1999-05-01 00:04:00,0,0.363848,-4.681394,-4.353147,14.127997,-0.138636,-17.836632,0.010803,-0.061114,...,29.984624,10.100265,-4.937924,-34.183774,27.004663,3.537487,0.033629,1.84054,0.00609,0
3,1999-05-01 00:06:00,0,0.30159,-4.758934,-4.023612,13.161566,-0.148142,-18.517601,0.002075,-0.061114,...,29.984624,10.10466,-4.938669,-35.954281,21.672449,3.986095,0.033721,2.55488,0.006097,0
4,1999-05-01 00:08:00,0,0.265578,-4.749928,-4.33315,15.26734,-0.155314,-17.505913,0.000732,-0.061114,...,29.984624,10.109054,-4.939414,-37.724789,21.907251,3.601573,0.033777,1.410494,0.006105,0


In [2]:
# Data inspection for missing values, data types, and anomalies

# Summary of missing values per column
missing_values = data.isnull().sum()

# Data types of each column
data_types = data.dtypes

# Descriptive statistics to identify any obvious anomalies
descriptive_stats = data.describe()

missing_values, data_types, descriptive_stats


(time    0
 y       0
 x1      0
 x2      0
 x3      0
        ..
 x57     0
 x58     0
 x59     0
 x60     0
 y.1     0
 Length: 62, dtype: int64,
 time    datetime64[ns]
 y                int64
 x1             float64
 x2             float64
 x3             float64
              ...      
 x57            float64
 x58            float64
 x59            float64
 x60            float64
 y.1              int64
 Length: 62, dtype: object,
                                 time             y            x1  \
 count                          18398  18398.000000  18398.000000   
 mean   1999-05-15 01:20:42.728557312      0.006740      0.011824   
 min              1999-05-01 00:00:00      0.000000     -3.787279   
 25%              1999-05-08 03:36:30      0.000000     -0.405681   
 50%              1999-05-14 18:39:00      0.000000      0.128245   
 75%              1999-05-22 06:01:30      0.000000      0.421222   
 max              1999-05-29 00:06:00      1.000000      3.054156   
 std    

In [3]:
# Extract year, month, day, and weekday from the 'time' column
data['year'] = data['time'].dt.year
data['month'] = data['time'].dt.month
data['day'] = data['time'].dt.day
data['weekday'] = data['time'].dt.weekday  # Monday=0, Sunday=6

# Calculate correlation matrix
correlation_matrix = data.corr()

# Extract correlations with the target variable 'y'
target_correlation = correlation_matrix['y'].sort_values(ascending=False)

# Display correlations with the target variable
print(target_correlation)


y        1.000000
y.1      0.390321
x15      0.058647
x42      0.034149
x9       0.024217
           ...   
x2      -0.090961
x3      -0.116369
x19     -0.140868
year          NaN
month         NaN
Name: y, Length: 66, dtype: float64


In [4]:
from sklearn.model_selection import train_test_split

# First, we'll drop the 'time' column and any other non-feature columns
# Assuming 'y' is the target variable and the rest are features
X = data.drop(['time', 'y'], axis=1)  # Adjust the drop list as per your dataset
y = data['y']

# Split the data into training and testing sets
# We'll use 80% of the data for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Output the shapes of the resulting splits to verify
print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Testing labels shape: {y_test.shape}")


Training features shape: (14718, 64)
Testing features shape: (3680, 64)
Training labels shape: (14718,)
Testing labels shape: (3680,)


In [5]:
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if convergence issues arise

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])  # Use predict_proba for ROC-AUC

# Print the evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"ROC-AUC Score: {roc_auc}")


Accuracy: 0.9951086956521739
Precision: 0.6666666666666666
Recall: 0.36363636363636365
F1 Score: 0.4705882352941177
ROC-AUC Score: 0.9021695909339431


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'penalty': ['l1', 'l2'],  # l1 is Lasso, l2 is Ridge
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Inverse of regularization strength
    'solver': ['liblinear', 'saga']  # Algorithms that support both l1 and l2 penalties
}

# Initialize the logistic regression model
logistic_model = LogisticRegression(max_iter=1000)

# Initialize the Grid Search with cross-validation
grid_search = GridSearchCV(estimator=logistic_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the Grid Search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.9947683754007297


In [9]:
from sklearn.linear_model import LogisticRegression
import joblib

# Initialize the logistic regression model with the best parameters
best_model = LogisticRegression(C=100, penalty='l1', solver='liblinear', max_iter=1000)

# Assuming you already have your training data in X_train and y_train
best_model.fit(X_train, y_train)

# Save the model to a file
model_filename = 'trained_model.joblib'
joblib.dump(best_model, model_filename)



['trained_model.joblib']

In [10]:
import joblib

# Assuming your trained model is stored in a variable named best_model or final_model
# Replace `best_model` with the actual variable name of your trained model
model_filename = 'trained_model.joblib'
joblib.dump(best_model, model_filename)

# Confirmation message
print(f"Model saved to {model_filename}")


Model saved to trained_model.joblib


SyntaxError: invalid syntax (757958164.py, line 1)