In [None]:
#import libraries
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.patches as mpatches
style.use('ggplot')
import scipy.stats as stats
from scipy.stats import linregress,sem,ttest_ind

In [None]:
# Years covered by the analysis; each one becomes a row of the yearly stats table.
yrs_to_process = list(range(1997, 2018))

# Columns of the yearly stats table that the charts further down rely on.
publishers_attributes = ["Marvel_Top_Thirty","DC_Top_Thirty",'Others_Top_Thirty','Marvel_Total_Sold',
                         'DC_Total_Sold','Others_Sold','Total_Sold','Total_Sales','Marvel_Total_Sales','DC_Total_Sales',
                         "Other_Total_Sales",'Marvel_price_avg','DC_price_avg','Others_price_avg'
                         ]

# Folder holding the monthly "<year>-*.json" files.
SOURCE_DATA_DIR = "./source-data/"
# Length of the monthly best-seller list behind the "*_Top_Thirty" columns.
TOP_N = 30


def summarize_month(monthly_df):
    """Reduce one month of rows to the figures the yearly table needs.

    Reads the "publisher", "count" and "price" columns of ``monthly_df``.
    "price" must hold strings; any "$" characters in them are removed before
    the text is converted to a number.

    Returns a dict with units sold (sum of "count") and sales (price * count)
    for all publishers, for Marvel and for DC, plus how many of the month's
    TOP_N highest-"count" rows belong to Marvel and to DC.
    """
    is_marvel = monthly_df["publisher"] == "Marvel"
    is_dc = monthly_df["publisher"] == "DC"
    units = monthly_df["count"]
    # regex=False strips a literal "$". Interpreted as a regular expression,
    # "$" is the end-of-string anchor and the price text would stay unchanged.
    unit_price = pd.to_numeric(monthly_df["price"].str.replace("$", "", regex=False))
    sales = unit_price * units.astype(float)
    best_sellers = monthly_df.sort_values(by="count", ascending=False).head(TOP_N)["publisher"]
    return {
        "Total_Sold": units.sum(),
        "Marvel_Total_Sold": units[is_marvel].sum(),
        "DC_Total_Sold": units[is_dc].sum(),
        "Total_Sales": sales.sum(),
        "Marvel_Total_Sales": sales[is_marvel].sum(),
        "DC_Total_Sales": sales[is_dc].sum(),
        "Marvel_Top_Thirty": (best_sellers == "Marvel").sum(),
        "DC_Top_Thirty": (best_sellers == "DC").sum(),
    }


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else np.nan


def summarize_year(year, source_dir=SOURCE_DATA_DIR):
    """Aggregate every "<year>-*.json" file in ``source_dir`` into one stats row.

    Returns a dict keyed by ``publishers_attributes``. Sold and sales figures
    are summed over the year's files, the top-thirty figures are averaged over
    them, and the "Others" figures are the remainder after Marvel and DC.
    Every value is NaN when no file matches the year.
    """
    paths = glob.glob(os.path.join(source_dir, f"{year}-*.json"))
    months = pd.DataFrame(
        [summarize_month(pd.read_json(path)) for path in paths if os.path.isfile(path)]
    )
    if months.empty:
        # No data for this year: report NaN rather than zeros and 0/0 divisions.
        return dict.fromkeys(publishers_attributes, np.nan)

    year_row = months.sum().to_dict()
    # Top-thirty figures are yearly averages of the monthly counts, not sums.
    year_row.update(months[["Marvel_Top_Thirty", "DC_Top_Thirty"]].mean().to_dict())

    year_row["Others_Top_Thirty"] = TOP_N - (year_row["Marvel_Top_Thirty"] + year_row["DC_Top_Thirty"])
    year_row["Others_Sold"] = year_row["Total_Sold"] - (year_row["Marvel_Total_Sold"] + year_row["DC_Total_Sold"])
    year_row["Other_Total_Sales"] = year_row["Total_Sales"] - (
        year_row["Marvel_Total_Sales"] + year_row["DC_Total_Sales"]
    )
    # Average price = sales divided by units sold.
    year_row["Marvel_price_avg"] = _ratio(year_row["Marvel_Total_Sales"], year_row["Marvel_Total_Sold"])
    year_row["DC_price_avg"] = _ratio(year_row["DC_Total_Sales"], year_row["DC_Total_Sold"])
    year_row["Others_price_avg"] = _ratio(year_row["Other_Total_Sales"], year_row["Others_Sold"])
    return year_row


# Build the table in one step from per-year records. Writing into the rows
# yielded by DataFrame.iterrows() is not guaranteed to update the frame, so the
# statistics are computed first and the frame is created from the results.
publishers_stats_df = pd.DataFrame(
    [summarize_year(year) for year in yrs_to_process],
    index=yrs_to_process,
    columns=publishers_attributes,
).astype(float)




In [None]:
# Display the yearly stats table (indexed by year) built in the previous cell.
publishers_stats_df

In [None]:
# Carve the yearly stats table into one frame per chart. Each frame moves the
# year index into a "Year" column and shortens the publisher column names to
# Marvel / DC / Others / Total.
# Columns are selected with label-based slices instead of np.split: np.split on
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.

# Top-thirty columns only (first column through 'Others_Top_Thirty').
top_30_publishers_stats_df = (
    publishers_stats_df.loc[:, :'Others_Top_Thirty']
    .reset_index()
    .rename(columns={'index': 'Year',
                     'Marvel_Top_Thirty': 'Marvel',
                     'DC_Top_Thirty': 'DC',
                     'Others_Top_Thirty': 'Others'})
)

# Everything from 'Marvel_Total_Sold' onward; only the four units-sold columns
# are renamed, the sales and average-price columns keep their original names.
total_sold_publishers_stats_df = (
    publishers_stats_df.loc[:, 'Marvel_Total_Sold':]
    .reset_index()
    .rename(columns={'index': 'Year',
                     'Marvel_Total_Sold': 'Marvel',
                     'DC_Total_Sold': 'DC',
                     'Others_Sold': 'Others',
                     'Total_Sold': 'Total'})
)

# Everything from 'Total_Sales' onward; the four sales columns are renamed, the
# average-price columns keep their original names.
total_sales_publishers_stats_df = (
    publishers_stats_df.loc[:, 'Total_Sales':]
    .reset_index()
    .rename(columns={'index': 'Year',
                     'Total_Sales': 'Total',
                     'DC_Total_Sales': 'DC',
                     'Marvel_Total_Sales': 'Marvel',
                     'Other_Total_Sales': 'Others'})
)
total_sales_publishers_stats_df

In [None]:
# Display the per-year top-thirty frame (Year, Marvel, DC, Others).
top_30_publishers_stats_df


In [None]:
# Display the per-year units-sold frame.
total_sold_publishers_stats_df

In [None]:
# Stacked bars: for each year, the average number of monthly top-30 entries
# held by Marvel, DC and everyone else.
# The x tick labels come from the frame's own "Year" column, so they always
# match the plotted rows instead of relying on a separately maintained list.
ax = top_30_publishers_stats_df.plot(
    x="Year",
    y=["Marvel", "DC", "Others"],
    kind='bar',
    stacked=True,
    figsize=(15, 10),
    rot=45,
    title="Marvel vs DC vs Others Top 30 Comic Books Sold",
)
ax.set_xlabel("Years")
ax.set_ylabel("Average Monthly Entries in Top 30")
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Publishers")
plt.show()

In [None]:
# stats.ttest_ind(total_sold_publishers_stats_df["Total"], total_sold_publishers_stats_df["DC"], equal_var=False)

In [None]:
# Grouped bars comparing the yearly units sold by Marvel and by DC.
sold_by_big_two = total_sold_publishers_stats_df[["Marvel", "DC"]]
ax = sold_by_big_two.plot(kind='bar', figsize=(20, 10))
ax.set_xticklabels(yrs_to_process, rotation=45)
ax.legend(title="Publishers", loc=2, bbox_to_anchor=(1.05, 1), borderaxespad=0.)
# Title and axis labels are set on the axes that was just created.
ax.set_title("Marvel vs DC Total Comic Books Sold ")
ax.set_xlabel("Years")
ax.set_ylabel("Comics Sold (Tens of Millions)")
plt.show()


In [None]:
# t-tests with equal_var=False (no equal-variance assumption) comparing the
# yearly "Total" units sold against each publisher group's yearly units sold.
# NOTE(review): "Total" includes each group's own units, so the two samples in
# every test are not independent - confirm this comparison is the intended one.
welch_vs_total = {
    group: ttest_ind(
        total_sold_publishers_stats_df["Total"],
        total_sold_publishers_stats_df[group],
        equal_var=False,
    )
    for group in ("Marvel", "DC", "Others")
}
t_stat_marvel, p_marvel = welch_vs_total["Marvel"]
t_stat_dc, p_dc = welch_vs_total["DC"]
t_stat_others, p_others = welch_vs_total["Others"]

In [None]:
# Report whether the Marvel t-test is significant at the 5% level.
print(
    "The difference in sample means is significant for Marvel."
    if p_marvel < 0.05
    else "The difference in sample means is not significant for Marvel."
)

In [None]:
# Report whether the DC t-test is significant at the 5% level.
# Both branches name DC: the "not significant" message previously said Marvel.
if p_dc < 0.05:
    print("The difference in sample means is significant for DC.")
else:
    print("The difference in sample means is not significant for DC.")

In [None]:
# Mean yearly units sold for each group, in the same order as the tick labels
# used by the error-bar chart.
tick_labels = ["Total", "Marvel", "DC", "Others"]
means = [total_sold_publishers_stats_df[group].mean() for group in tick_labels]
# One x position per group: 0, 1, 2, 3.
x_axis = np.arange(len(means))
means

In [None]:
# Standard error of the mean of yearly units sold, in Total / Marvel / DC /
# Others order.
# The function is called through the scipy.stats module because this cell
# rebinds the name `sem` (read by the error-bar chart) to the resulting list.
# Calling the bare imported `sem` here would raise "'list' object is not
# callable" the second time the cell is run.
sem = [
    stats.sem(total_sold_publishers_stats_df[group])
    for group in ("Total", "Marvel", "DC", "Others")
]
sem


In [None]:
# Mean yearly units sold per group, with the standard error as error bars.
# The figure is created first so that figsize applies to this chart; calling
# plt.figure() after plotting opens a second, empty figure instead of resizing
# the one that was drawn.
fig, ax = plt.subplots(figsize=(20, 10))
ax.errorbar(x_axis, means, yerr=sem, fmt="o", color='red')
ax.set_title("Average Comics sold vs Marvel vs DC vs Others")
ax.set_xlim(-0.5, 3.5)
ax.set_ylim(0, 85000000)
ax.set_xticks(x_axis)
ax.set_xticklabels(tick_labels)
ax.set_ylabel("Average Comics Sold per Year")
plt.show()

In [None]:
# Inputs for the pie chart: one wedge per publisher group, sized by that
# group's sales summed over every year in the stats table.
labels = ["Marvel", "DC", "Others"]
sizes = [
    publishers_stats_df[column].sum()
    for column in ("Marvel_Total_Sales", "DC_Total_Sales", "Other_Total_Sales")
]

# Offset the first wedge (Marvel) from the centre; the others stay in place.
explode = (0.1, 0, 0)
sizes

In [None]:
# Draw, scale and title the pie chart in one cell.
# NOTE: when figures are rendered per cell (as Jupyter's inline backend does),
# a title or axis setting issued in a later cell lands on a new, empty figure,
# so everything that belongs to this chart has to run together.
# autopct prints each wedge's percentage of the whole.
plt.pie(sizes, explode=explode, labels=labels,
        autopct="%1.1f%%", shadow=True, startangle=140)
# Equal axis scaling keeps the pie circular.
plt.axis("equal")
# The title reports total sales over all years, in billions rounded to 0.1.
plt.title("% of Total Sales by Publisher (1997-2017) $"
          + str(round(publishers_stats_df["Total_Sales"].sum()/1000000000,1)) + " Net Billion")
plt.show()

In [None]:
# stats.ttest_ind(publishers_stats_df["Marvel_Total_Sales"], publishers_stats_df["DC_Total_Sales"], equal_var=False)

In [None]:

# Linear regression of Marvel's yearly sales against the year.
# Despite the "total_sold" prefix, the inputs come from the sales frame.
marvel_trend = linregress(total_sales_publishers_stats_df["Year"],
                          total_sales_publishers_stats_df["Marvel"])
total_sold_marvel_slope = marvel_trend.slope
total_sold_marvel_int = marvel_trend.intercept
# Fitted line evaluated at the observed years.
total_sold_marvel_fit = total_sold_marvel_int + total_sales_publishers_stats_df["Year"] * total_sold_marvel_slope

In [None]:
# Extend the Marvel trend line to 2018-2030, printing and collecting one
# predicted sales value per year.
yrs_to_predict = list(range(2018, 2031))
marvel_sales_predict = []
for Year in yrs_to_predict:
    sales_predict = total_sold_marvel_int + Year * total_sold_marvel_slope
    marvel_sales_predict.append(sales_predict)
    print(f"The total comics sales prediction for Marvel in {Year} will be {sales_predict}.")

In [None]:
# Linear regression of DC's yearly sales against the year.
dc_trend = linregress(total_sales_publishers_stats_df["Year"],
                      total_sales_publishers_stats_df["DC"])
dc_sales_slope = dc_trend.slope
dc_sales_int = dc_trend.intercept
# Fitted line evaluated at the observed years.
dc_sales_fit = dc_sales_int + total_sales_publishers_stats_df["Year"] * dc_sales_slope

In [None]:
# Extend the DC trend line to 2018-2030, printing and collecting one predicted
# sales value per year.
yrs_to_predict = list(range(2018, 2031))
dc_sales_predict = []
for Year in yrs_to_predict:
    sales_predict = dc_sales_int + Year * dc_sales_slope
    dc_sales_predict.append(sales_predict)
    print(f"The total comics sales prediction for DC in {Year} will be {sales_predict}.")

In [None]:
# Gather the per-year predictions for both publishers into one frame, with one
# row per predicted year.
sales_predictions_df = pd.DataFrame(
    dict(Year=yrs_to_predict, Marvel=marvel_sales_predict, DC=dc_sales_predict)
)
sales_predictions_df

In [None]:
# Grouped bars of the predicted yearly sales for Marvel and DC.
# The predictions come from regressions on the sales columns (price * count),
# so the y axis is labelled as sales in dollars rather than comics sold.
# The x tick labels come from the frame's own "Year" column.
ax = sales_predictions_df.plot(x="Year", y=["Marvel", "DC"], kind='bar', figsize=(20, 10), rot=45)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title="Publishers")
ax.set_title("Marvel vs DC Total Comic Books Sales Prediction ")
ax.set_xlabel("Years")
ax.set_ylabel("Predicted Sales (USD)")
plt.show()