In [499]:
# Import Dependencies
import pandas as pd
import os

def readFile(path):
    return pd.read_json(path)

# Create a reference to the json and import it into a Pandas DataFrame
json_path = os.path.join("resources","purchase_data.json")
purchase_df = readFile(json_path)
# purchase_df

In [500]:
# Count the number of total players
totalCount = purchase_df["SN"].value_counts().count()
pd.DataFrame({"Total Players": [totalCount]})

Unnamed: 0,Total Players
0,573


In [501]:
# Create a new df with formatting
purchasingAnalysis_df = pd.DataFrame({
    "Number of Unique Items":[purchase_df["Item ID"].value_counts().count()],
    "Average Purchase Price":["$" + str(round(purchase_df["Price"].mean(),2))],
    "Total Number of Purchases":[len(purchase_df.index)],
    "Total Revenue":["$" + str(round(purchase_df["Price"].sum(),2))]
})

# Output
purchasingAnalysis_df[[
    "Number of Unique Items",
    "Average Purchase Price",
    "Total Number of Purchases",
    "Total Revenue"
]]

Unnamed: 0,Number of Unique Items,Average Purchase Price,Total Number of Purchases,Total Revenue
0,183,$2.93,780,$2286.33


In [502]:
# Create a new df removed duplicated "SN"
uniquePlayers_df = purchase_df[~purchase_df["SN"].duplicated()]
# uniquePlayers_df

# Group by gender
gender = uniquePlayers_df.groupby("Gender")

# Create a new df
gender_df = pd.DataFrame({
    "Percentage of Players": gender["Age"].count()/totalCount,
    "Total Count":gender["Age"].count()
})

# Format
gender_df["Percentage of Players"] = gender_df["Percentage of Players"].map("{:.2%}".format)

# Sort and Output
gender_df.sort_values(by="Total Count",ascending=False)

Unnamed: 0_level_0,Percentage of Players,Total Count
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,81.15%,465
Female,17.45%,100
Other / Non-Disclosed,1.40%,8


In [503]:
# Create a new df
purchasing_gender_df = pd.DataFrame({
    "Purchase Count": gender_df["Total Count"],
    "Average Purchase Price":gender["Price"].mean(),
    "Total Purchase Value":gender["Price"].sum(),
    "Normalized Totals":gender["Price"].mean()*gender["Price"].std()
})

#Format and Output
purchasing_gender_df[[
    "Purchase Count",
    "Average Purchase Price",
    "Total Purchase Value",
    "Normalized Totals"
]].style.format({
    "Average Purchase Price":"${:.2f}",
    "Total Purchase Value":"${:.2f}",
    "Normalized Totals":"${:.2f}"
})

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,100,$2.89,$288.79,$3.24
Male,465,$2.99,$1389.72,$3.27
Other / Non-Disclosed,8,$3.45,$27.58,$2.66


In [504]:
# Set bins and labels
# TODO make it smarter
bins = [0,10,15,20,25,30,35,40,45]
group_labels = ["< 10","10 - 14","15 - 19","20 - 24","25 - 29","30 - 34","35 - 39","40 - 45"]
uniquePlayers_df["ageRange"] = pd.cut(uniquePlayers_df["Age"],bins,labels=group_labels)
# uniquePlayers_df.head()

# Group by ageRange
uniquePlayers_ageRange = uniquePlayers_df.groupby("ageRange")

# Format and Output
pd.DataFrame({
    "Percentage of Players":uniquePlayers_ageRange["Age"].count()/totalCount,
    "Total Count": uniquePlayers_ageRange["Age"].count()
}).style.format({
    "Percentage of Players": "{:.2%}"
})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,Percentage of Players,Total Count
ageRange,Unnamed: 1_level_1,Unnamed: 2_level_1
< 10,3.84%,22
10 - 14,9.42%,54
15 - 19,24.26%,139
20 - 24,40.84%,234
25 - 29,9.08%,52
30 - 34,7.68%,44
35 - 39,4.36%,25
40 - 45,0.52%,3


In [505]:
# Create a new df
purchaseAnal_ageRange = pd.DataFrame({
    "Purchase Count": uniquePlayers_ageRange["Age"].count(),
    "Average Purchase Price": uniquePlayers_ageRange["Price"].mean(),
    "Total Purchase Value": uniquePlayers_ageRange["Price"].sum(),
    "Normalized Totals": uniquePlayers_ageRange["Price"].mean()*uniquePlayers_ageRange["Price"].std()
})

# Format and Output
purchaseAnal_ageRange[[
    "Purchase Count",
    "Average Purchase Price",
    "Total Purchase Value",
    "Normalized Totals"
]].style.format({
    "Average Purchase Price":"${:.2f}",
    "Total Purchase Value":"${:.2f}",
    "Normalized Totals":"${:.2f}"
})

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value,Normalized Totals
ageRange,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
< 10,22,$3.16,$69.50,$3.52
10 - 14,54,$2.89,$155.94,$3.16
15 - 19,139,$2.90,$403.12,$3.31
20 - 24,234,$2.99,$699.12,$3.30
25 - 29,52,$2.92,$151.65,$3.09
30 - 34,44,$3.33,$146.48,$3.30
35 - 39,25,$2.87,$71.64,$3.03
40 - 45,3,$2.88,$8.64,$2.48


In [506]:
# Group by SN
purchase_bySN = purchase_df.groupby("SN")

# Create a new df
topSpenders_df = pd.DataFrame({
    "Purchase Count" : purchase_bySN["Item ID"].count(),
    "Average Purchase Price": purchase_bySN["Price"].mean(),
    "Total Purchase Value": purchase_bySN["Price"].sum()
})

# Sort, Format, and Output
topSpenders_df[[
    "Purchase Count",
    "Average Purchase Price",
    "Total Purchase Value"
]].sort_values(
    by=["Purchase Count", "Average Purchase Price"],
    ascending=False
).head().style.format({
    "Average Purchase Price":"${:.2f}",
    "Total Purchase Value":"${:.2f}"
})

Unnamed: 0_level_0,Purchase Count,Average Purchase Price,Total Purchase Value
SN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Undirrala66,5,$3.41,$17.06
Saedue76,4,$3.39,$13.56
Mindimnya67,4,$3.18,$12.74
Sondastan54,4,$2.56,$10.24
Qarwen67,4,$2.49,$9.97


In [507]:
# Group by Item ID and Item Name
purchase_byItem = purchase_df.groupby(["Item ID","Item Name"])

# Create a new df
topItem_df = pd.DataFrame({
    "Purchase Count":purchase_byItem["SN"].count(),
    "Item Price":purchase_byItem["Price"].mean(),
    "Total Purchase Value":purchase_byItem["Price"].sum()
})

# Sort by Purchase Count, Format, and Output
topItem_df[[
    "Purchase Count",
    "Item Price","Total Purchase Value"
]].sort_values(
    by=["Purchase Count", "Total Purchase Value"],
    ascending=False
).head().style.format({
    "Item Price": "${:.2f}",
    "Total Purchase Value": "${:.2f}"
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,"Betrayal, Whisper of Grieving Widows",11,$2.35,$25.85
84,Arcane Gem,11,$2.23,$24.53
34,Retribution Axe,9,$4.14,$37.26
31,Trickster,9,$2.07,$18.63
13,Serenity,9,$1.49,$13.41


In [498]:
# Sort by Total Purchase Value, Format and Output
topItem_df[[
    "Purchase Count",
    "Item Price",
    "Total Purchase Value"
]].sort_values(
    by=["Total Purchase Value", "Item Price"],
    ascending=False
).head().style.format({
    "Item Price": "${:.2f}",
    "Total Purchase Value": "${:.2f}"
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Purchase Count,Item Price,Total Purchase Value
Item ID,Item Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
94,Mourning Blade,3,$3.64,$10.92
117,"Heartstriker, Legacy of the Light",2,$4.71,$9.42
93,Apocalyptic Battlescythe,2,$4.49,$8.98
90,Betrayer,2,$4.12,$8.24
154,Feral Katana,2,$4.11,$8.22
