In [16]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import word2number
from word2number import w2n
# from pyspark.sql import SparkSession


In [17]:
inventory = pd.read_csv("C:/Users/Vedant/Desktop/Assignments-1/deliverables/inventory.csv", encoding='utf-8')

products = pd.read_csv(r"C:/Users/Vedant/Desktop/Assignments-1/deliverables/products.csv", encoding='utf-8')

In [22]:
customers = pd.read_json('C:/Users/Vedant/Desktop/Assignments-1/deliverables/customers.json')

In [18]:
sales_folder = "C:/Users/Vedant/Desktop/Assignments-1/deliverables/sales"
def read_file(file_path):
    try:
        return pd.read_excel(file_path)
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return pd.DataFrame()

def load_sales_parallel(folder_path, max_threads=8, limit=None):
    xlsx_files = sorted([
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith(".xlsx")
    ])
    
    if limit is not None:
        xlsx_files = xlsx_files[:limit]
    
    print(f"Found {len(xlsx_files)} Excel files to read with {max_threads} threads.")

    all_sales = []
    with ThreadPoolExecutor(max_threads) as executor:
        future_to_file = {executor.submit(read_file, f): f for f in xlsx_files}
        for i, future in enumerate(as_completed(future_to_file), 1):
            result = future.result()
            if not result.empty:
                all_sales.append(result)
            print(f"✅ Processed file {i}/{len(xlsx_files)}")

    combined_df = pd.concat(all_sales, ignore_index=True)
    print(f"\n📊 Total records combined: {len(combined_df)}")
    return combined_df

# 🔁 Example usage: Load only first 20 files
combined_sales_df = load_sales_parallel(sales_folder, max_threads=8, limit=2)

Found 2 Excel files to read with 8 threads.
✅ Processed file 1/2
✅ Processed file 2/2

📊 Total records combined: 140675


In [None]:
def clean_dataframe(df):
    # Clean column names
    df.columns = (
        df.columns.str.strip()              
                 .str.lower()               
                 .str.replace(" ", "_")     
                 .str.replace(r"[^\w_]", "", regex=True) 
    )
    # Clean string-type cells
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.strip().str.lower()
    
    return df


combined_sales_df = clean_dataframe(combined_sales_df)
inventory = clean_dataframe(inventory)
products = clean_dataframe(products)
customers = clean_dataframe(customers)

In [20]:

def convert_to_number(val):
    try:
        return int(val)
    except ValueError:
        try:
            return word2number.word_to_num(val)
        except:
            return None
    
def lvl1cleaning(df):
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    print(f"\nCONVERTING NUMBER WORDS TO INTEGER IN INT COL: {numeric_cols}")
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: convert_to_number(x) if isinstance(x, str) else x)

    df[numeric_cols] = df[numeric_cols].fillna(0)

    print("\n MISSING VALUES AFTER FILLING NAN AS 0")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if not missing.empty:
        print("COLUMNS WITH MISSING VALUES:")
        print(missing.sort_values(ascending=False))
    else:
        print("No missing values.")

        
    
def quick_eda(df, show_value_counts=False, value_count_limit=10):
    print(f"ROWS: {df.shape[0]}, COLUMNS: {df.shape[1]}")
    print(f"\n COLUMNS & TYPES : \n {df.dtypes}")
    
    print("\n MISSING VALUES")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if not missing.empty:
        print("COLUMNS WITH MISSING VALUES:")
        print(missing.sort_values(ascending=False))
    else:
        print("No missing values.")
    
    print(f"\n UNIQUE VALUES PER COLUMN \n {df.nunique()}")
    print(f"\n SAMPLE DATA \n {df.head()}")

    print("\n SUMMARY STATS (Numerical)")
    print(df.describe(include=[int, float]))
    
    print("\n SUMMARY STATS (Categorical)")
    print(df.describe(include=[object, "category"]))
    
    if show_value_counts:
        print("\n VALUE COUNTS")
        for col in df.columns:
            unique_vals = df[col].nunique()
            if unique_vals <= value_count_limit:
                print(f"\n{col} (top {value_count_limit} categories)")
                print(df[col].value_counts(dropna=False).head(value_count_limit))

In [26]:
# quick_eda(combined_sales_df, show_value_counts=True)
quick_eda(inventory, show_value_counts=True)
# quick_eda(products, show_value_counts=True)
# quick_eda(customers, show_value_counts=True)

ROWS: 15000, COLUMNS: 6

 COLUMNS & TYPES : 
 warehouse_id           object
product_id             object
stock_level            object
reorder_level           int64
avg_daily_sales         int64
days_until_reorder    float64
dtype: object

 MISSING VALUES
COLUMNS WITH MISSING VALUES:
stock_level    616
dtype: int64

 UNIQUE VALUES PER COLUMN 
 warehouse_id          15000
product_id              100
stock_level             293
reorder_level            81
avg_daily_sales          91
days_until_reorder       10
dtype: int64

 SAMPLE DATA 
   warehouse_id product_id stock_level  reorder_level  avg_daily_sales  \
0     w000_000    pid1000         ten             63               16   
1     w000_001    pid1001         138             45               79   
2     w000_002    pid1002          69             88               40   
3     w000_003    pid1003          42             95               25   
4     w000_004    pid1004         105             61               60   

   days_until_reo

In [25]:
lvl1cleaning(inventory)


CONVERTING NUMBER WORDS TO INTEGER IN INT COL: ['reorder_level', 'avg_daily_sales', 'days_until_reorder']

 MISSING VALUES AFTER FILLING NAN AS 0
COLUMNS WITH MISSING VALUES:
stock_level    616
dtype: int64
