<a href="https://colab.research.google.com/github/yasminela/AI-ML/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime so files stored there can be read
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np


# Location of the input CSV inside the mounted Drive
# (change the folder part if the file lives elsewhere).
file_path = '/content/drive/MyDrive/perfume(projet R)/cleaned_perfume.csv'

# Read the whole CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Report the starting size and how many rows are exact copies of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Initial shape: {df.shape}")
print(f"Number of duplicates: {duplicate_count}")

# Keep only the first occurrence of each row, then report the new size.
df = df.drop_duplicates()
print(f"Shape after removing duplicates: {df.shape}")

Initial shape: (1000, 10)
Number of duplicates: 0
Shape after removing duplicates: (1000, 10)


In [None]:
# Show the number of missing values per column.
print(df.isnull().sum())

# Numeric columns: gaps are replaced by the column median.
# (The dataset has no "Revenue" column, so only these three are handled.)
num_cols = ['price', 'available', 'sold']
for numeric_name in num_cols:
    numeric_series = df[numeric_name]
    # Only touch columns that actually contain gaps.
    if numeric_series.isna().any():
        df[numeric_name] = numeric_series.fillna(numeric_series.median())

# Categorical columns: gaps are replaced by the most frequent value.
cat_cols = ['brand', 'title', 'type', 'itemLocation']
for categorical_name in cat_cols:
    categorical_series = df[categorical_name]
    if categorical_series.isna().any():
        # mode() lists the most frequent value(s); the first one is used.
        most_frequent = categorical_series.mode()[0]
        df[categorical_name] = categorical_series.fillna(most_frequent)

brand                1
title                0
type                 0
price                0
priceWithCurrency    0
available            0
availableText        3
sold                 6
lastUpdated          0
itemLocation         3
dtype: int64


In [None]:
import re

# Compiled once at definition time instead of on every call.
# The ranges are limited to emoji/symbol blocks. A single span from U+24C2 to
# U+1F251 would also match CJK, Hangul and most other non-Latin scripts and
# silently delete legitimate text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs, flags
    "\U000025A0-\U000025FF"  # geometric shapes
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, arrows)
    "\U0000FE00-\U0000FE0F"  # variation selectors attached to emoji
    "\U000024C2"             # circled M
    "\U00003030\U0000303D\U00003297\U00003299"  # CJK-block emoji symbols
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.

    Returns
    -------
    str or missing value
        The cleaned string. ``None`` and float ``NaN`` are returned unchanged
        so that missing cells stay missing instead of becoming the literal
        text ``'None'`` / ``'nan'``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return text
    return _EMOJI_PATTERN.sub('', str(text))

# Run the emoji stripper over every free-text column, one value at a time.
text_cols = ['title', 'availableText', 'itemLocation']
for text_col in text_cols:
    df[text_col] = df[text_col].map(remove_emojis)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale price, available and sold, storing the scaled values in new
# *_norm columns next to the originals.
source_columns = ['price', 'available', 'sold']
scaled_columns = ['price_norm', 'available_norm', 'sold_norm']

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[source_columns])
df[scaled_columns] = scaled_values

In [None]:
# Clean brand names: trim surrounding whitespace and title-case each word.
# str.title() treats an apostrophe as a word boundary, so a possessive such as
# "Victoria's" comes out as "Victoria'S"; the final replace lowers that
# trailing "'S" back to "'s" while leaving names like "L'Oreal" untouched.
df['brand'] = (
    df['brand']
    .str.strip()
    .str.title()
    .str.replace(r"(?<=\w)(['\u2019])S\b", r"\1s", regex=True)
)

# Clean itemLocation: normalise comma spacing to exactly ", ".
# Whitespace is optional on both sides of the comma so "City,State" and
# "City , State" are normalised too, not only "City,   State".
# Mapping state names to a standard form would still need a separate lookup table.
df['itemLocation'] = (
    df['itemLocation']
    .str.replace(r'\s*,\s*', ', ', regex=True)
    .str.strip()
)

In [None]:
# Identify price outliers with the 1.5 * IQR rule.
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Cap outliers: prices outside the bounds are pulled back to the nearest bound.
# Missing prices stay missing.
df['price'] = df['price'].clip(lower=lower_bound, upper=upper_bound)

# price_norm is created in an earlier cell from the *uncapped* prices, so after
# capping it would no longer describe the values stored in 'price'. Recompute
# it with the same min-max formula so the two columns stay consistent.
if 'price_norm' in df.columns:
    price_min = df['price'].min()
    price_span = df['price'].max() - price_min
    if price_span != 0:
        df['price_norm'] = (df['price'] - price_min) / price_span
    else:
        # Every price is identical: avoid dividing by zero and use 0.0 throughout.
        df['price_norm'] = 0.0

In [None]:
# Parse lastUpdated as datetimes; values that cannot be parsed become NaT.
df['lastUpdated'] = pd.to_datetime(df['lastUpdated'], errors='coerce')

# Force the numeric columns to numeric dtypes; values that cannot be parsed
# become NaN.
for numeric_col in ('price', 'available', 'sold'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Select relevant columns for final output.
#
# The selection needs a 'brand_encoded' column and one-hot 'type_*' columns,
# but no visible cell creates them, so on a fresh kernel this cell failed with
# a KeyError. They are now built here when absent; columns that already exist
# are left untouched.
# NOTE(review): the original encoding cells are not in the notebook. An
# alphabetical integer code per brand and plain one-hot columns for 'type' are
# assumed; confirm against the intended encoding.
if 'brand_encoded' not in df.columns:
    # factorize(sort=True) numbers the distinct brands in sorted order.
    df['brand_encoded'] = pd.factorize(df['brand'], sort=True)[0]

if not any(name.startswith('type_') for name in df.columns):
    # One indicator column per distinct value of 'type', named 'type_<value>'.
    df = df.join(pd.get_dummies(df['type'], prefix='type'))

type_columns = [col for col in df.columns if col.startswith('type_')]

cleaned_df = df[[
    'brand', 'title', 'price', 'price_norm', 'available', 'available_norm',
    'sold', 'sold_norm', 'itemLocation', 'lastUpdated', 'brand_encoded'
] + type_columns]

print("Data cleaning complete. Final shape:", cleaned_df.shape)

Data cleaning complete. Final shape: (1000, 70)


In [None]:
# Save the cleaned dataframe to a new CSV file.
# cleaned_df (the curated column selection built in the previous cell) is
# written instead of the working frame df, which still carries raw helper
# columns such as priceWithCurrency and availableText.
cleaned_df.to_csv('cleaned.csv', index=False)

In [None]:
# Send the exported CSV to the browser as a download (Google Colab only).
from google.colab import files

EXPORTED_CSV = 'cleaned.csv'
files.download(EXPORTED_CSV)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>