In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler



In [4]:
# Load your CSV dataset
df = pd.read_csv('covtype_train.csv')

# Assuming your target variable is named 'target'
X = df.drop('Cover_Type', axis=1)
y = df['Cover_Type']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [19]:
from sklearn.model_selection import GridSearchCV
# Apply SMOTE to the data
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_smote)

# Create a logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Define the hyperparameter grid for grid search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2'],  # Regularization type
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_scaled, y_smote)

# Print the best hyperparameters found by grid search
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV Accuracy: {:.2f}".format(grid_search.best_score_))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_smote, test_size=0.2, random_state=42)

# Fit the model on the training data using the best hyperparameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred)

# Print the classification report
print("\nClassification Report:\n", report)





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters: {'C': 10, 'penalty': 'l2'}
Best CV Accuracy: 0.72


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report:
               precision    recall  f1-score   support

           1       0.67      0.66      0.66      5647
           2       0.64      0.59      0.62      5639
           3       0.64      0.54      0.58      5599
           4       0.82      0.90      0.86      5647
           5       0.76      0.81      0.78      5573
           6       0.64      0.70      0.66      5758
           7       0.88      0.89      0.89      5685

    accuracy                           0.73     39548
   macro avg       0.72      0.73      0.72     39548
weighted avg       0.72      0.73      0.72     39548



In [23]:
df_test = pd.read_csv('covtype_test.csv')

X_test_existing_scaled = scaler.transform(df_test)

# Fit the model on the entire training data using the best hyperparameters
final_model = best_model  # Using the best model obtained from grid search
final_model.fit(X_train_smote_scaled, y_train_smote)

# Make predictions on the existing test data
y_pred_existing = final_model.predict(X_test_existing_scaled)

# Create a DataFrame with the predictions
predictions_df = pd.DataFrame({'Predicted_Target': y_pred_existing})

# Concatenate the predictions with the original test data
result_df = pd.concat([df_test, predictions_df], axis=1)

# Save the results to a CSV file
result_df.to_csv('multinomial_results.csv', index=False)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
df_test = pd.read_csv('covtype_test.csv')
X_test_existing = df_test


In [25]:
X_test_existing_scaled = scaler.transform(X_test_existing)

# Fit the model on the entire training data using the best hyperparameters
final_model = best_model

In [26]:
y_pred_existing = final_model.predict(X_test_existing_scaled)

In [28]:
custom_threshold = 0.03
mn_predictions = y_pred_existing.astype(int)

In [30]:
pd.DataFrame(mn_predictions).to_csv('multinomial.csv', 
                                               index=True, 
                                               header=["prediction"])