In [1]:
import random
from pathlib import Path

import pandas as pd

## Creating Aggregate Store Sales

In [2]:
# Load the raw weekly sales file and parse 'Date' into datetime values in one
# chained expression; the column keeps its original position in the frame.
train_df = pd.read_csv('train.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Row/column count first, then a preview of the first rows
print(train_df.shape)
train_df.head()

(421570, 5)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [3]:
# Collapse the sales rows to one row per (Store, Date) by summing
# 'Weekly_Sales', then turn the group keys back into ordinary columns.
agg_df = (
    train_df
    .groupby(['Store', 'Date'])['Weekly_Sales']
    .sum()
    .reset_index()
)

print(agg_df.shape)
agg_df.head()

(6435, 3)


Unnamed: 0,Store,Date,Weekly_Sales
0,1,2010-02-05,1643690.9
1,1,2010-02-12,1641957.44
2,1,2010-02-19,1611968.17
3,1,2010-02-26,1409727.59
4,1,2010-03-05,1554806.68


In [4]:
# Persist the per-store totals; index=False keeps the row index out of the file
agg_df.to_csv(path_or_buf='store_sales.csv', index=False)

In [5]:
# Read the exported file back in as a quick round-trip check of its contents
test = pd.read_csv(filepath_or_buffer='store_sales.csv')
print(test.shape)
test.head()

(6435, 3)


Unnamed: 0,Store,Date,Weekly_Sales
0,1,2010-02-05,1643690.9
1,1,2010-02-12,1641957.44
2,1,2010-02-19,1611968.17
3,1,2010-02-26,1409727.59
4,1,2010-03-05,1554806.68


In [6]:
# Load the store conditions table and parse 'Date' into datetime values so the
# column has the same type as the sales data it is later joined with.
feature_df = pd.read_csv('store_conditions.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
feature_df.shape

(8190, 12)

In [7]:
# Inner-join the aggregated sales with the store conditions on the composite
# key (Store, Date); rows lacking a match on both keys are dropped.
merged_df = agg_df.merge(feature_df, how='inner', on=['Store', 'Date'])

# Dimensions of the joined frame, followed by a preview
print(merged_df.shape)
merged_df.head()

(6435, 13)


Unnamed: 0,Store,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,1643690.9,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,1641957.44,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,1611968.17,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,1409727.59,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,1554806.68,46.5,2.625,,,,,,211.350143,8.106,False


In [9]:
# Export the merged store-level dataset.
# The original destination is an absolute path inside one user's home
# directory, so to_csv raises OSError on any machine where that directory is
# absent. Keep that location when it exists (identical behaviour to before);
# otherwise fall back to a file of the same name in the working directory so
# the notebook still runs end to end.
combined_sales_path = Path('/Users/pavan/Desktop/Fall-23/ADT/ADT-Fall23-Group07-Walmart-Sales-Analysis/Application/combined_sales_data.csv')
if not combined_sales_path.parent.is_dir():
    combined_sales_path = Path(combined_sales_path.name)

merged_df.to_csv(combined_sales_path, index=False)

# Make the chosen destination visible in the cell output
print(f'Wrote {combined_sales_path}')

## Creating Store Details Table

In [8]:
# Fix the stdlib RNG seed so the synthetic managers/locations are reproducible
random.seed(16)

num_stores = 45

# Sequential store ids 1..num_stores and a display name for each of them
store_numbers = [*range(1, num_stores + 1)]
store_names = [f"Walmart Store {num}" for num in store_numbers]

# Pools from which the synthetic manager names are assembled
first_names = ["John", "Jane", "Robert", "Emily", "Michael"]
last_names = ["Doe", "Smith", "Johnson", "Davis", "Brown"]

# One "<first> <last>" per store. The first name is drawn before the last
# name, which keeps the sequence of RNG draws (and so the output) unchanged.
managers = [
    " ".join((random.choice(first_names), random.choice(last_names)))
    for _ in range(num_stores)
]

city_names = [
    "New York", "Los Angeles", "Bloomington", "Chicago", "Houston", "Phoenix",
    "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Francisco",
]

# Column name -> values; the locations are drawn only after every manager
# name has been generated.
data = {
    'StoreNumber': store_numbers,
    'StoreName': store_names,
    'Manager': managers,
    'Location': [random.choice(city_names) for _ in range(num_stores)],
}

store_details_df = pd.DataFrame(data)

# Write the table without the row index, then display it
store_details_df.to_csv('store_details.csv', index=False)
store_details_df

Unnamed: 0,StoreNumber,StoreName,Manager,Location
0,1,Walmart Store 1,Robert Davis,San Diego
1,2,Walmart Store 2,Emily Johnson,Los Angeles
2,3,Walmart Store 3,Emily Smith,Chicago
3,4,Walmart Store 4,Emily Doe,New York
4,5,Walmart Store 5,Emily Johnson,Philadelphia
5,6,Walmart Store 6,Jane Smith,San Francisco
6,7,Walmart Store 7,John Johnson,New York
7,8,Walmart Store 8,Robert Johnson,Los Angeles
8,9,Walmart Store 9,Jane Brown,San Francisco
9,10,Walmart Store 10,Robert Doe,New York
