In [1]:
# Step into our cozy coding haven, where each line of code wraps us in a comforting embrace.

# Ah, the 'import pandas as pd' line ushers in the mighty pandas library, like opening the door to a trusted friend.
# It's as if we're welcoming an esteemed guest into our cozy home, ready to embark on a data-driven adventure.

# This line of code imports the pandas library and aliases it as pd.
# It's like inviting a trusted friend into our coding sanctuary, someone who's got our back when it comes to handling data.
import pandas as pd


In [2]:
# Load the monthly sales figures from the CSV file in the parent of the working directory.
# pandas parses the file into a DataFrame, which is bound to `df` for the cells below.
df = pd.read_csv(filepath_or_buffer='../MonthlySales.csv')


In [3]:
# Display the DataFrame that the previous cell loaded from '../MonthlySales.csv'.
# A bare expression on the last line of a cell is rendered as that cell's output.

df


Unnamed: 0,month,sales
0,2013-01-01,14236.9
1,2013-02-01,4519.89
2,2013-03-01,55691.01
3,2013-04-01,28295.35
4,2013-05-01,23648.29
5,2013-06-01,34595.13
6,2013-07-01,33946.39
7,2013-08-01,27909.47
8,2013-09-01,81777.35
9,2013-10-01,31453.39


In [4]:
# Yo, we 'bout to dive deep into the data game, so we bringin' in the JSON crew.
# It's like hittin' up the data wizards to put some magic on our files, you dig?

# This 'import json' line brings in the JSON crew, allowing us to work with JSON data.
# It's like calling in the data wizards to work their magic on our files and unlock their secrets.

import json

# Check it, we're bringin' in the JSON normalization squad straight outta the pandas block.
# It's like havin' the data SWAT team on speed dial, ready to handle any format like pros.

# This 'from pandas import json_normalize' line imports the json_normalize function from the pandas library.
# It's like having the data SWAT team on standby, ready to normalize our JSON data into a structured format.

from pandas import json_normalize

# Read the per-category monthly sales JSON file and parse it into plain Python objects.
# The encoding is passed explicitly: JSON text is UTF-8, whereas open() would otherwise
# fall back to the platform's locale encoding, which is not UTF-8 on every system.
with open('../MonthlySalesByCategory.json', encoding='utf-8') as json_data:
    # `d` holds the parsed document; the next cell normalizes d['contents'].
    d = json.load(json_data)


In [5]:
# Flatten the nested JSON document into a tabular DataFrame.
# record_path='monthlySales' makes every element of each record's 'monthlySales' list one row;
# meta=['category', 'region'] repeats the parent record's 'category' and 'region' values
# on each of those rows as extra columns.
# Note: this rebinds `df`, replacing the CSV DataFrame loaded earlier in the notebook.
df = json_normalize(
    d['contents'],
    record_path='monthlySales',
    meta=['category', 'region'],
)

# Show the normalized table as the cell output.
df


Unnamed: 0,month,sales,category,region
0,20130101,38,Furniture,West
1,20130201,35,Furniture,West
2,20130301,41,Furniture,West
3,20130401,55,Furniture,West
4,20130501,58,Furniture,West
5,20130601,66,Furniture,West
6,20130701,74,Furniture,West
7,20130801,78,Furniture,West
8,20130901,38,Furniture,West
9,20131001,30,Furniture,West


In [6]:
# A'ight, we 'bout to bring some data flavor into our crib, so we callin' up the parquet pandas.
# It's like invitin' a squad of data architects to design the blueprint for our data mansion.

# This 'import pyarrow.parquet as pq' line brings in the Parquet module of the pyarrow library under the alias 'pq'.
# It's like inviting a squad of data architects to our data mansion, ready to design the blueprint in Parquet format.
import pyarrow.parquet as pq


In [7]:
# Read the Parquet file of monthly product sales into memory.
# pq.read_table returns a pyarrow Table (not a pandas DataFrame); it is kept in `table`
# so that a later cell can convert it.
table = pq.read_table(source='../MonthlyProductSales.parquet')


In [8]:
# Convert the pyarrow Table read in the previous cell into a pandas DataFrame and
# display it as the cell output. The result is not bound to a name in this cell.

# NOTE(review): the rendered output shows bytes values (b'...') in the date and product
# name columns; presumably the file stores them as binary rather than string.
# Confirm, and decode them if text is needed.
table.to_pandas()


Unnamed: 0,Month of Order Date,Product Name,Sales
0,b'2017-01-13',b'Xerox 1972',11
1,b'2017-01-13',b'Xerox 1923',37
2,b'2017-01-13',b'Xerox 225',19
3,b'2017-01-13',b'Xerox 195',40
4,b'2017-01-13',b'Wireless Extenders zBoost YX545 SOHO Signal ...,756
...,...,...,...
9247,b'2017-12-16',b'14-7/8 x 11 Blue Bar Computer Printout Paper',96
9248,b'2017-12-16',"b'6"" Cubicle Wall Clock, Black'",10
9249,b'2017-12-16',b'3M Hangers With Command Adhesive',10
9250,b'2017-12-16',b'3.6 Cubic Foot Counter Height Office Refrige...,295


In [9]:
#!pip install html5lib

In [10]:
# Scrape the tables from Wikipedia's list of U.S. state abbreviations.
# pd.read_html fetches the page and returns a list of DataFrames, one per HTML table
# it finds, so `df` is a list here rather than a single DataFrame.
# An HTML parser backend must be installed for this call; the cell above keeps a
# commented-out pip command for html5lib in case it is missing.
df = pd.read_html(io='https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations')


In [11]:
# `df` is the list of DataFrames returned by pd.read_html in the previous cell.
# Keep only the first table found on the page (index 0).

# NOTE(review): assumes the first table on the page is the states/abbreviations table.
# Confirm against the live page, since its layout can change.
df_usa = df[0]


In [12]:
# Trim the scraped table down to the rows and columns of interest.

# Remove the rows at positions 0-10 (the first 11 rows) by their index labels.
# NOTE(review): presumably these are header/preamble rows of the scraped table; confirm.
df_usa_cleaned = df_usa.drop(index=df_usa.index[range(11)])

# Remove the columns at positions 10-14. The slice simply stops early if the table
# has fewer than 15 columns.
final_df = df_usa_cleaned.drop(columns=df_usa_cleaned.columns[10:15])


In [13]:
# Give the first ten columns descriptive names.
# The mapping keys are the integer column labels 0-9; rename() leaves any label that is
# not in the mapping untouched and silently ignores mapping keys that are not present.
# NOTE(review): assumes the scraped table has integer column labels (no parsed header); confirm.
#
# The renamed frame is rebound to `final_df` instead of using inplace=True: the result is
# the same for the cells below, but no existing object is mutated and the call stays chainable.
final_df = final_df.rename(columns={
    0: 'Region Name',
    1: 'Region Status',
    2: 'ISO',
    3: 'ANSI_Letter',
    4: 'ANSI_Code',
    5: 'USPS',
    6: 'USCG',
    7: 'GPO',
    8: 'AP',
    9: 'Other Abbreviations',
})


In [14]:
# Renumber the rows of final_df.

# reset_index replaces the current row index with a fresh sequential one starting at 0.
# Because drop=True, the old index is discarded instead of being added as a column.
# The result is bound to a new name; final_df itself is left unchanged.
final_df_reset = final_df.reset_index(drop=True)
