In [1]:
# NOTE(review): none of the visible cells import the PyPI `unzip` package;
# the archive is extracted with the standard-library `zipfile` module in the
# next cell, so this install looks unnecessary -- confirm and remove.
!pip install unzip

Collecting unzip
  Downloading unzip-1.0.0.tar.gz (704 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unzip
  Building wheel for unzip (setup.py) ... [?25l[?25hdone
  Created wheel for unzip: filename=unzip-1.0.0-py3-none-any.whl size=1283 sha256=260a2b9f491965f2dc650beb2702b291f3968eb7838ff3fa1ebcf11927716065
  Stored in directory: /root/.cache/pip/wheels/80/dc/7a/f8af45bc239e7933509183f038ea8d46f3610aab82b35369f4
Successfully built unzip
Installing collected packages: unzip
Successfully installed unzip-1.0.0


In [2]:
import zipfile

# Extract into the same absolute directory the loading step reads from
# ('/content/extracted_data/...'), so the notebook does not silently depend
# on the kernel's current working directory being '/content'.
with zipfile.ZipFile('/content/archive (8).zip', 'r') as zip_ref:
    zip_ref.extractall('/content/extracted_data')

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
df = pd.read_csv('/content/extracted_data/100_Sales.csv')  # Replace with your file path if needed

# Step 2: Inspect the dataset
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() -- that only appends a stray "None" line.
df.info()
print("\nSample Data:")
print(df.head())

# Step 3: Drop unnecessary columns
# Remove the auto-named "Unnamed: N" columns, then any remaining column that
# is entirely NaN (such a column carries no information for the models).
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df = df.dropna(axis=1, how='all')

# Fail fast: without the target there is nothing to train on, so check for it
# before spending any work on encoding.
if 'Sales_Channel' not in df.columns:
    raise ValueError("Target column 'Sales_Channel' not found in the dataset.")

# Step 4: Encode categorical variables
# Encode the target variable 'Sales_Channel' as integers. LabelEncoder numbers
# the classes in sorted label order, so this is presumably 0 = Offline,
# 1 = Online; the fitted encoder is kept so `target_encoder.classes_` can be
# inspected to confirm the actual mapping.
target_encoder = LabelEncoder()
df['Sales_Channel'] = target_encoder.fit_transform(df['Sales_Channel'])

# Encode all other categorical columns using one-hot encoding
# NOTE(review): every remaining object-dtype column is one-hot encoded as a
# plain string, including Ship_Date (listed as object in the info() output) --
# consider parsing it into date features instead; confirm intent.
categorical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Step 5: Define features (X) and target (y)
X = df.drop(['Sales_Channel'], axis=1)  # Features
y = df['Sales_Channel']  # Target

# Step 6: Hold out 30% of the rows as a test set (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)
print(f"Train set size: {len(X_train)}, Test set size: {len(X_test)}")

def _print_evaluation(heading, y_true, y_pred):
    """Print `heading`, then the accuracy and classification report of `y_pred` against `y_true`."""
    print(heading)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))


# Step 7: Fit a Decision Tree on the training split and score it on the test split
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)
_print_evaluation("\nDecision Tree Evaluation:", y_test, dt_preds)

# Step 8: Fit a 100-tree Random Forest on the same split and score it the same way
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
_print_evaluation("\nRandom Forest Evaluation:", y_test, rf_preds)


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Region          100 non-null    object 
 1   Country         100 non-null    object 
 2   Item_Type       100 non-null    object 
 3   Sales_Channel   100 non-null    object 
 4   Order_Priority  100 non-null    object 
 5   Ship_Date       100 non-null    object 
 6   Unit_Cost       100 non-null    float64
 7   Total_Revenue   100 non-null    float64
 8   Total_Profit    100 non-null    float64
 9   Unnamed: 9      0 non-null      float64
 10  Unnamed: 10     0 non-null      float64
dtypes: float64(5), object(6)
memory usage: 8.7+ KB
None

Sample Data:
                              Region                Country        Item_Type  \
0              Australia and Oceania                 Tuvalu        Baby Food   
1  Central America and the Caribbean                Grenada       