In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score

# Assuming you have your data in a DataFrame 'train' with columns 'timestamp', 'value', and 'TARGET'

# Prepare the feature matrix (X) and the target variable (y)
X = train[['value']]
y = train['TARGET']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Precision: {precision}")
print(f"Recall: {recall}")

print(f"P&R Score: {(precision+recall)/2}")


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [28]:
# Assuming you have your data in a DataFrame 'train' with columns 'timestamp', 'value', and 'TARGET'

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])


# Prepare the feature matrix (X) and the target variable (y)
X_train = train['value']  # Include the timestamp features
y_train = train['TARGET']
X_test = test['value']  # Include the timestamp features

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
#accuracy = accuracy_score(y_test, y_pred)
#confusion = confusion_matrix(y_test, y_pred)
#classification_rep = classification_report(y_test, y_pred)

# Calculate precision and recall
#precision = precision_score(y_test, y_pred)
#recall = recall_score(y_test, y_pred)

#print(f"Precision: {precision}")
#print(f"Recall: {recall}")

#print(f"P&R Score: {(precision+recall)/2}")

y_pred = pd.DataFrame(y_pred)

print(submission)

submission['TARGET'] = y_pred
submission = submission.reset_index(drop=True)

submission.to_csv('submission_2.csv', index=False)

          ID  TARGET
0      50000       0
1      50001       0
2      50002       0
3      50003       0
4      50004       0
...      ...     ...
49995  99995       0
49996  99996       0
49997  99997       0
49998  99998       0
49999  99999       0

[50000 rows x 2 columns]


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Assuming you have your data in a DataFrame 'train' with columns 'timestamp', 'value', and 'TARGET'

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Extract timestamp features
train['hour'] = train['timestamp'].dt.hour
train['dayofweek'] = train['timestamp'].dt.dayofweek
train['month'] = train['timestamp'].dt.month

test['hour'] = test['timestamp'].dt.hour
test['dayofweek'] = test['timestamp'].dt.dayofweek
test['month'] = test['timestamp'].dt.month

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features
y_train = train['TARGET']
X_test = test[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features

# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a K-Nearest Neighbors model with a specified number of neighbors (e.g., n_neighbors=5)
model = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors as needed

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
#accuracy = accuracy_score(y_test, y_pred)
#confusion = confusion_matrix(y_test, y_pred)
#classification_rep = classification_report(y_test, y_pred)

# Calculate precision and recall
#precision = precision_score(y_test, y_pred)
#recall = recall_score(y_test, y_pred)

#print(f"Precision: {precision}")
#print(f"Recall: {recall}")

#print(f"P&R Score: {(precision+recall)/2}")

y_pred = pd.DataFrame(y_pred)

print(submission)

submission['TARGET'] = y_pred
submission = submission.reset_index(drop=True)

submission.to_csv('submission_knn.csv', index=False)


          ID  TARGET
0      50000       0
1      50001       0
2      50002       0
3      50003       0
4      50004       0
...      ...     ...
49995  99995       0
49996  99996       0
49997  99997       0
49998  99998       0
49999  99999       0

[50000 rows x 2 columns]


In [37]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import LocalOutlierFactor

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Extract timestamp features
train['hour'] = train['timestamp'].dt.hour
train['dayofweek'] = train['timestamp'].dt.dayofweek
train['month'] = train['timestamp'].dt.month

test['hour'] = test['timestamp'].dt.hour
test['dayofweek'] = test['timestamp'].dt.dayofweek
test['month'] = test['timestamp'].dt.month

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features
y_train = train['TARGET']
X_test = test[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features

# Add Local Outlier Factor (LOF) scores as features
lof_model = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
lof_scores = lof_model.fit_predict(X_train)
X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data
X_test_with_lof = X_test.copy()  # Create a copy of the test data
lof_scores_test = lof_model.fit_predict(X_test)
X_test_with_lof['LOF_scores'] = lof_scores_test  # Include LOF scores in the test data

# Create a Random Forest model
model = RandomForestClassifier(n_estimators=100, max_depth = 10, criterion= 'entropy', random_state=42)  # You can adjust the number of estimators as needed

# Train the model on the training data with LOF scores
model.fit(X_train, y_train)

# Make predictions on the test data with LOF scores
y_pred = model.predict(X_test_with_lof)

# Optionally, you can evaluate the model's performance using various metrics such as accuracy.
# accuracy = accuracy_score(y_test, y_pred)

# Save the predictions to a CSV file
submission['TARGET'] = y_pred
submission.to_csv('submission_rf_lof.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data


In [40]:
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import LocalOutlierFactor

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Extract timestamp features
train['hour'] = train['timestamp'].dt.hour
train['dayofweek'] = train['timestamp'].dt.dayofweek
train['month'] = train['timestamp'].dt.month

test['hour'] = test['timestamp'].dt.hour
test['dayofweek'] = test['timestamp'].dt.dayofweek
test['month'] = test['timestamp'].dt.month

# Identify the missing dates between 6.02 and 6.04
missing_dates = pd.date_range(start='2023-06-02', end='2023-06-04', freq='T')

# Create an Exponential Smoothing model to forecast missing values
forecast_model = ExponentialSmoothing(train['value'], seasonal='add', seasonal_periods=13)  # Adjust the seasonal parameters
forecast_model = forecast_model.fit()

# Fill in the missing values in the training data for the specified dates
for date in missing_dates:
    forecasted_value = forecast_model.forecast(1).iloc[0]
    train.loc[train['timestamp'] == date, 'value'] = forecasted_value

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value', 'hour', 'dayofweek', 'month']]
y_train = train['TARGET']
X_test = test[['value', 'hour', 'dayofweek', 'month']]

# Add Local Outlier Factor (LOF) scores as features
lof_model = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
lof_scores = lof_model.fit_predict(X_train)
X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data
X_test_with_lof = X_test.copy()  # Create a copy of the test data
lof_scores_test = lof_model.fit_predict(X_test)
X_test_with_lof['LOF_scores'] = lof_scores_test  # Include LOF scores in the test data

# Create a Random Forest model
model = RandomForestClassifier(n_estimators=200,  # Increase the number of trees
    max_depth=10,     # Limit the depth of trees to avoid overfitting
    min_samples_split=2,  # Adjust the minimum samples required to split a node
    min_samples_leaf=1,   # Adjust the minimum samples in a leaf node
    max_features='sqrt',  # Use square root of the number of features for splits
    criterion='gini',     # Measure quality of split using Gini impurity
    random_state=42,      # Set a random seed for reproducibility
    class_weight='balanced',  # Assign weights to classes for imbalanced datasets
    bootstrap=True,       # Use bootstrapping to sample data
    oob_score=True        # Calculate OOB score)  # You can adjust the number of estimators as needed)
                              )
# Train the model on the training data with LOF scores
model.fit(X_train, y_train)

# Make predictions on the test data with LOF scores
y_pred = model.predict(X_test_with_lof)

# Optionally, you can evaluate the model's performance using various metrics such as accuracy.
# accuracy = accuracy_score(y_test, y_pred)

# Save the predictions to a CSV file
submission['TARGET'] = y_pred
submission.to_csv('submission_rf_lof_stl.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data


In [41]:
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import LocalOutlierFactor
from scipy import stats

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Extract timestamp features
train['hour'] = train['timestamp'].dt.hour
train['dayofweek'] = train['timestamp'].dt.dayofweek
train['month'] = train['timestamp'].dt.month

test['hour'] = test['timestamp'].dt.hour
test['dayofweek'] = test['timestamp'].dt.dayofweek
test['month'] = test['timestamp'].dt.month

# Identify the missing dates between 6.02 and 6.04
missing_dates = pd.date_range(start='2023-06-02', end='2023-06-04', freq='T')

# Create an Exponential Smoothing model to forecast missing values
forecast_model = ExponentialSmoothing(train['value'], seasonal='add', seasonal_periods=13)  # Adjust the seasonal parameters
forecast_model = forecast_model.fit()

# Fill in the missing values in the training data for the specified dates
for date in missing_dates:
    forecasted_value = forecast_model.forecast(1).iloc[0]
    train.loc[train['timestamp'] == date, 'value'] = forecasted_value

# Calculate Z-Scores and flag anomalies
train['z_score'] = stats.zscore(train['value'])
threshold = 2  # You can adjust the threshold as needed
train['is_anomaly'] = train['z_score'].apply(lambda x: 1 if abs(x) > threshold else 0)

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value', 'hour', 'dayofweek', 'month', 'is_anomaly']]
y_train = train['TARGET']
X_test = test[['value', 'hour', 'dayofweek', 'month']]

# Add Local Outlier Factor (LOF) scores as features
lof_model = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
lof_scores = lof_model.fit_predict(X_train)
X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data
X_test_with_lof = X_test.copy()  # Create a copy of the test data
lof_scores_test = lof_model.fit_predict(X_test)
X_test_with_lof['LOF_scores'] = lof_scores_test  # Include LOF scores in the test data

# Create a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed

# Train the model on the training data with LOF scores
model.fit(X_train, y_train)

# Make predictions on the test data with LOF scores
y_pred = model.predict(X_test_with_lof)

# Optionally, you can evaluate the model's performance using various metrics such as accuracy.
# accuracy = accuracy_score(y_test, y_pred)

# Save the predictions to a CSV file
submission['TARGET'] = y_pred
submission.to_csv('submission_rf_lof_zscore.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['LOF_scores'] = lof_scores  # Include LOF scores in the training data


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- is_anomaly


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Extract timestamp features
train['hour'] = train['timestamp'].dt.hour
train['dayofweek'] = train['timestamp'].dt.dayofweek
train['month'] = train['timestamp'].dt.month

test['hour'] = test['timestamp'].dt.hour
test['dayofweek'] = test['timestamp'].dt.dayofweek
test['month'] = test['timestamp'].dt.month

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features
y_train = train['TARGET']
X_test = test[['value', 'hour', 'dayofweek', 'month']]  # Include the timestamp features

# Create a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of estimators as needed

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
# accuracy = accuracy_score(y_test, y_pred)
# confusion = confusion_matrix(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)

# Optionally, you can calculate precision and recall
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)

# print(f"Precision: {precision}")
# print(f"Recall: {recall}")

# print(f"P&R Score: {(precision+recall)/2}")

y_pred = pd.DataFrame(y_pred)

print(submission)

submission['TARGET'] = y_pred
submission = submission.reset_index(drop=True)

submission.to_csv('submission_rf1.csv', index=False)


In [45]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# Convert the 'timestamp' column to datetime type
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

# Prepare the feature matrix (X) and the target variable (y)
X_train = train[['value']]  # Use 'value' as the feature
y_train = train['TARGET']
X_test = test[['value']]  # Use 'value' as the feature

# Create a Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=42)  # You can adjust the number of estimators as needed

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Optionally, you can evaluate the model's performance using various metrics such as accuracy.
# accuracy = accuracy_score(y_test, y_pred)
# confusion = confusion_matrix(y_test, y_pred)
# classification_rep = classification_report(y_test, y_pred)

# Calculate precision and recall
# precision = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)

# print(f"Precision: {precision}")
# print(f"Recall: {recall}")

# print(f"P&R Score: {(precision+recall)/2}")

y_pred = pd.DataFrame(y_pred)

print(submission)

submission['TARGET'] = y_pred
submission = submission.reset_index(drop=True)

submission.to_csv('submission_rf2.csv', index=False)


          ID  TARGET
0      50000       0
1      50001       0
2      50002       0
3      50003       0
4      50004       0
...      ...     ...
49995  99995       0
49996  99996       0
49997  99997       0
49998  99998       0
49999  99999       0

[50000 rows x 2 columns]
