In [5]:
import pandas as pd
import numpy as np

# Set a random seed for reproducibility
np.random.seed(42)

# Number of rows for the dataset
num_rows = 1000

# Generate synthetic data
data = {
    "Unit_Cost": np.random.uniform(10, 500, num_rows),  # Random float between 10 and 500
    "Total_Revenue": np.random.uniform(1000, 50000, num_rows),  # Random float between 1000 and 50000
    "Total_Profit": np.random.uniform(500, 20000, num_rows),  # Random float between 500 and 20000
    "Region": np.random.choice(
        ["North America", "Europe", "Asia", "Australia", "Africa", "South America"], num_rows
    ),
    "Country": np.random.choice(
        ["USA", "Canada", "UK", "Germany", "China", "India", "Australia", "Brazil", "South Africa"], num_rows
    ),
    "Item_Type": np.random.choice(
        ["Beverages", "Cereal", "Snacks", "Personal Care", "Clothes", "Fruits", "Household"], num_rows
    ),
    "Order_Priority": np.random.choice(["H", "M", "L"], num_rows),  # High, Medium, Low priority
    "Sales_Channel": np.random.choice(["Online", "Offline"], num_rows),  # Binary target variable
}

# Create a DataFrame
df = pd.DataFrame(data)

# Save the dataset as a CSV file
df.to_csv("synthetic_sales_data.csv", index=False)

# Display the first few rows
print(df.head())


    Unit_Cost  Total_Revenue  Total_Profit         Region    Country  \
0  193.524658   10071.513513   5603.260833           Asia     Brazil   
1  475.850010   27553.146422   5316.086582  North America         UK   
2  368.677031   43774.345958  18171.964320         Africa  Australia   
3  303.342657   36879.019434   5366.150897  South America        USA   
4   86.449134   40521.496245   5803.019660           Asia        USA   

  Item_Type Order_Priority Sales_Channel  
0    Cereal              M       Offline  
1    Snacks              H        Online  
2    Cereal              L       Offline  
3    Snacks              L        Online  
4    Fruits              H        Online  


In [6]:
df = pd.read_csv("synthetic_sales_data.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unit_Cost       1000 non-null   float64
 1   Total_Revenue   1000 non-null   float64
 2   Total_Profit    1000 non-null   float64
 3   Region          1000 non-null   object 
 4   Country         1000 non-null   object 
 5   Item_Type       1000 non-null   object 
 6   Order_Priority  1000 non-null   object 
 7   Sales_Channel   1000 non-null   object 
dtypes: float64(3), object(5)
memory usage: 62.6+ KB
None


In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your dataset
df = pd.read_csv('/content/synthetic_sales_data.csv')

# Assume the target column is 'Sales_Channel'
y = df['Sales_Channel']  # Extract the target variable
X = df.drop('Sales_Channel', axis=1)  # Drop the target column to get features

# Encode the target variable if it is categorical
if y.dtypes == 'object':
    y = LabelEncoder().fit_transform(y)

# Convert categorical features in X to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Display the sizes of the train and test sets
print(f"Train set size: {len(X_train)}, Test set size: {len(X_test)}")


Train set size: 700, Test set size: 300


In [9]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Task 5: Train and Evaluate Machine Learning Models

# Logistic Regression
log_reg_model = LogisticRegression(random_state=42)
log_reg_model.fit(X_train, y_train)  # Train the model
log_reg_preds = log_reg_model.predict(X_test)  # Predict on test set

# Evaluate Logistic Regression
print("\nLogistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, log_reg_preds))
print("Classification Report:\n", classification_report(y_test, log_reg_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, log_reg_preds))

# Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)  # Train the model
rf_preds = rf_model.predict(X_test)  # Predict on test set

# Evaluate Random Forest
print("\nRandom Forest Results:")
print("Accuracy:", accuracy_score(y_test, rf_preds))
print("Classification Report:\n", classification_report(y_test, rf_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, rf_preds))

# Comparison
print("\nModel Comparison:")
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, log_reg_preds):.2f}")
print(f"Random Forest Accuracy: {accuracy_score(y_test, rf_preds):.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Results:
Accuracy: 0.5833333333333334
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.62      0.60       152
           1       0.58      0.55      0.56       148

    accuracy                           0.58       300
   macro avg       0.58      0.58      0.58       300
weighted avg       0.58      0.58      0.58       300

Confusion Matrix:
 [[94 58]
 [67 81]]

Random Forest Results:
Accuracy: 0.5166666666666667
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.62      0.57       152
           1       0.51      0.41      0.45       148

    accuracy                           0.52       300
   macro avg       0.52      0.52      0.51       300
weighted avg       0.52      0.52      0.51       300

Confusion Matrix:
 [[95 57]
 [88 60]]

Model Comparison:
Logistic Regression Accuracy: 0.58
Random Forest Accuracy: 0.52
