# Sales Analysis Project
Analyze 12 months of sales data to answer key business questions.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


## 1. Data Generation (Synthetic)
We will create 12 CSV files representing 12 months of sales data.


In [None]:

# Function to generate random sales data
def generate_sales_data(month, seed=None):
    """Build a synthetic hourly sales table for one month of 2023.

    One order row is produced for every hour of the requested month. Product
    names are drawn according to ``weights``; quantity, price and city are
    drawn uniformly and independently of the product.

    Args:
        month: Calendar month number (1-12).
        seed: Optional seed for a private random generator, making the output
            reproducible. When ``None`` (the default) the global ``np.random``
            state is used.

    Returns:
        A DataFrame with the columns 'Order ID', 'Product',
        'Quantity Ordered', 'Price Each', 'Order Date' and 'Purchase Address'.

    Raises:
        ValueError: If ``month`` is not a valid month number.
    """
    products = ['iPhone', 'Google Pixel', 'Macbook Pro', 'ThinkPad', 'Lightning Cable', 'USB-C Cable', 'Headphones', 'Monitor']
    cities = ['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'Boston', 'Austin']
    weights = [5, 2, 3, 4, 20, 15, 10, 5]

    # Cover every hour of the real calendar month; ending the range at
    # "<month>-28 00:00" dropped the 28th and all later days.
    start = pd.Timestamp(year=2023, month=month, day=1)
    dates = pd.date_range(start=start, periods=start.days_in_month * 24, freq='h')
    n = len(dates)

    # A month has at most 744 hourly rows, so a stride of 1000 gives each
    # month its own ID range and IDs stay unique once the months are merged.
    first_order_id = 1000 * month

    # Module-level np.random and RandomState expose the same sampling methods.
    rng = np.random if seed is None else np.random.RandomState(seed)

    total_weight = sum(weights)
    data = {
        'Order ID': np.arange(first_order_id, first_order_id + n),
        'Product': rng.choice(products, n, p=[w / total_weight for w in weights]),
        'Quantity Ordered': rng.randint(1, 5, n),  # upper bound is exclusive: 1..4
        'Price Each': rng.uniform(10, 2000, n).round(2),
        'Order Date': dates,
        'Purchase Address': rng.choice(cities, n)
    }
    return pd.DataFrame(data)

# Write one CSV per month (1 through 12) into the sales_data directory
os.makedirs('sales_data', exist_ok=True)
for month_number in range(1, 13):
    monthly_frame = generate_sales_data(month_number)
    monthly_frame.to_csv(f'sales_data/Sales_Month_{month_number}.csv', index=False)
print("Sales data generated successfully!")


## 2. Merge Data


In [None]:
# Merge all 12 months of data into a single file
data_dir = './sales_data'

# Keep only CSV files so stray directory entries (e.g. .ipynb_checkpoints or
# .DS_Store) are not passed to read_csv; sort so the merge order is
# deterministic rather than whatever order os.listdir happens to return.
files = sorted(file for file in os.listdir(data_dir) if file.endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No CSV files found in {data_dir!r}")

# Read every file first and concatenate once: concatenating inside the loop
# re-copies the accumulated frame on every iteration.
all_months_data = pd.concat(
    [pd.read_csv(os.path.join(data_dir, file)) for file in files],
    ignore_index=True,
)

# Persist the merged table, then continue the analysis from the saved file.
all_months_data.to_csv("all_data.csv", index=False)
all_data = pd.read_csv("all_data.csv")
all_data.head()


## 3. Data Cleaning


In [None]:
# Cleaning steps (drop NaN, convert columns)

# Discard rows in which every field is missing
all_data = all_data.dropna(how='all')

# Convert the quantity and price columns to numeric dtypes
for numeric_col in ('Quantity Ordered', 'Price Each'):
    all_data[numeric_col] = pd.to_numeric(all_data[numeric_col])

# Parse the order timestamps so the .dt accessor can be used on them
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])


## 4. Augment Data with Additional Columns


In [None]:
# Add Month Column
all_data['Month'] = all_data['Order Date'].dt.month

# Add Sales Column (revenue per order line)
all_data['Sales'] = all_data['Quantity Ordered'] * all_data['Price Each']

# Add City Column (simplified: the text before the first comma of the address).
# The vectorised .str accessor passes a missing address through as NaN, where
# calling .split inside a Python-level lambda would raise AttributeError.
all_data['City'] = all_data['Purchase Address'].str.split(',').str[0]
all_data.head()


## 5. Data Analysis (Question & Answer)


### Q1: What was the best month for sales?


In [None]:
# Sum the numeric columns per month. numeric_only=True is required because
# pandas >= 2.0 no longer drops non-numeric columns silently and raises
# TypeError when asked to sum the datetime 'Order Date' column.
results = all_data.groupby('Month').sum(numeric_only=True)

# Take the x positions from the result itself so the bars always line up with
# the data, even when some month has no rows.
months = results.index
plt.bar(months, results['Sales'])
plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')
plt.show()


### Q2: Which city had the highest total sales?


In [None]:
# Total sales per city; the index of the aggregated Series holds the city
# names in the same sorted order a second groupby would iterate them.
results = all_data.groupby('City')['Sales'].sum()
cities = results.index.tolist()

plt.bar(cities, results)
plt.xticks(cities, rotation='vertical', size=8)
plt.xlabel('City Name')
plt.ylabel('Sales in USD ($)')
plt.show()


### Q3: What time should we display advertisements to maximize the likelihood of customers buying a product?


In [None]:
# Hour of day (0-23) of each order, plus a constant column to count rows with
all_data['Hour'] = all_data['Order Date'].dt.hour
all_data['Count'] = 1

# Group by the column name itself rather than the one-element list ['Hour']:
# with a list, pandas >= 2.0 yields 1-tuples such as (0,) as group keys when
# iterating, which breaks the x-values and tick positions. Grouping once also
# avoids building every sub-frame just to collect the keys.
orders_per_hour = all_data.groupby('Hour')['Count'].count()
keys = orders_per_hour.index.tolist()

plt.plot(keys, orders_per_hour)
plt.xticks(keys)
plt.xlabel('Hour')
plt.ylabel('Number of Orders')
plt.grid()
plt.show()
