In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy as sp
from scipy import stats
warnings.filterwarnings('ignore')

1. Has the size of the sets changed over time?

In [2]:
# Read sets.csv into a DataFrame and preview its first five rows.
df_sets = pd.read_csv(filepath_or_buffer="/kaggle/input/lego-database/sets.csv")
df_sets.head(n=5)

Unnamed: 0,set_num,name,year,theme_id,num_parts
0,00-1,Weetabix Castle,1970,414,471
1,0011-2,Town Mini-Figures,1978,84,12
2,0011-3,Castle 2 for 1 Bonus Offer,1987,199,2
3,0012-1,Space Mini-Figures,1979,143,12
4,0013-1,Space Mini-Figures,1979,143,12


In [3]:
# The four sets with the most parts, largest first.
df_sets.sort_values(by='num_parts', ascending=False).head(4)

Unnamed: 0,set_num,name,year,theme_id,num_parts
170,10189-1,Taj Mahal,2008,276,5922
11614,SWMP-1,Star Wars / M&M Mosaic - Promo Set,2005,169,5461
1337,2000409-1,Window Exploration Bag,2010,507,5200
161,10179-1,Millennium Falcon - UCS,2007,174,5195


In [4]:
# Order the sets by part count (descending) and keep the first row.
biggest_set = (
    df_sets
    .sort_values(by='num_parts', ascending=False)
    .iloc[0]
)
print(f"The biggest set is: {biggest_set['name']} with {biggest_set['num_parts']} parts.")

The biggest set is: Taj Mahal with 5922 parts.


In [9]:
# Per-year summary of the sets table: part-count statistics, number of distinct
# themes and number of sets.
# The aggregations are given as pandas string aliases rather than NumPy
# callables. pandas currently replaces np.mean / np.std / np.max / np.min with
# its own groupby methods and has deprecated that substitution; once the
# callable is used directly, np.std would compute the population standard
# deviation (ddof=0) instead of the sample one (ddof=1) returned today.
df_sets_per_year = df_sets.groupby('year').agg(
    mean_part=('num_parts', 'mean'),
    std=('num_parts', 'std'),          # sample standard deviation (ddof=1)
    max_part=('num_parts', 'max'),
    min_part=('num_parts', 'min'),
    theme_nbr=('theme_id', 'nunique'),
    set_nbr=('set_num', 'size'),       # counts every row of the group, like len
)

# Scatter plot of the yearly mean part count; the x axis is the frame's index.
Fig_1 = px.scatter(
    df_sets_per_year,
    x=df_sets_per_year.index,
    y='mean_part',
    title="Evolution of the mean size of LEGO's set from 1950",
    labels={
        'year': 'Year',
        'mean_part': "Average number of part",
        'theme_nbr': 'Number of theme',
    },
    width=1200,
    height=600,
)

# Enlarge the markers, then render the figure.
Fig_1.update_traces(marker={'size': 10})
Fig_1.show()

# Pearson correlation between the year (index) and the yearly mean part count.
# pearsonr is reached through an explicit `scipy.stats` import because a bare
# `import scipy` does not load the stats submodule on every SciPy version.
r, p = stats.pearsonr(df_sets_per_year.index.values, df_sets_per_year['mean_part'])

# Report whether the correlation is significant at the 5% level.
if p < 0.05:
    print("The evolution of the size of LEGO's set over time is significant.")
else:
    print("There has not been significant changes in the size of LEGO's set over time.")

The evolution of the size of LEGO's set over time is significant.


In [299]:
# Number of sets per year; marker colour and marker size both encode the
# number of themes.
Fig_1 = px.scatter(
    df_sets_per_year,
    x=df_sets_per_year.index,
    y='set_nbr',
    size='theme_nbr',
    color='theme_nbr',
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Number of sets per year",
    labels={
        'year': 'Year',
        'set_nbr': 'Number of sets',
        'theme_nbr': "Number of themes",
    },
    width=1200,
    height=600,
)

Fig_1.show()

In [10]:
# Mean part count against the number of themes, one point per year.
# Marker size follows the mean part count and marker colour follows the year
# (the frame's index). A title and a label for the year colour axis are set so
# the figure can be read on its own, like the other figures of this notebook.
Fig_1 = px.scatter(df_sets_per_year,
                   x = 'theme_nbr',
                   y = 'mean_part',
                   color = df_sets_per_year.index,
                   size = 'mean_part',
                   labels = {'year' : 'Year',
                          'set_nbr':'Number of sets',
                          'mean_part' : "Average number of parts",
                          'theme_nbr' : 'Number of themes'},
                   title = "Average number of parts per set versus number of themes",
                   height = 600,
                   width = 800)


Fig_1.show()

- Yes, the size of LEGO sets has increased substantially since the first sets were released in 1950.
- The number of sets released each year has also increased and is related to the growing number of themes offered by LEGO, which is not very surprising.
- The average number of parts per set tends to increase with the number of themes, which simply reflects the evolution of the LEGO industry since 1950 (more themes, more sets, larger sets). We can see, however, that large sets were already offered early on (e.g. in 1960).

In [11]:
# Read themes.csv into a DataFrame and preview its first five rows.
df_themes = pd.read_csv(filepath_or_buffer="/kaggle/input/lego-database/themes.csv")
df_themes.head(n=5)

Unnamed: 0,id,name,parent_id
0,1,Technic,
1,2,Arctic Technic,1.0
2,3,Competition,1.0
3,4,Expert Builder,1.0
4,5,Model,1.0


In [12]:
# Join each set with its theme, drop the theme_id / id / parent_id columns and
# give the two clashing `name` columns explicit names.
df_merged = (
    pd.merge(df_sets, df_themes, left_on='theme_id', right_on='id')
    .drop(columns=['theme_id', 'id', 'parent_id'])
    .rename(columns={'name_x': 'set', 'name_y': 'theme'})
)
df_merged.head()

Unnamed: 0,set_num,set,year,num_parts,theme
0,00-1,Weetabix Castle,1970,471,Castle
1,0011-2,Town Mini-Figures,1978,12,Supplemental
2,0011-3,Castle 2 for 1 Bonus Offer,1987,2,Lion Knights
3,0012-1,Space Mini-Figures,1979,12,Supplemental
4,0013-1,Space Mini-Figures,1979,12,Supplemental


In [327]:
# Number of rows (sets) per theme.
df_merged['theme'].value_counts()

theme
Supplemental           496
Technic                435
City                   287
Friends                269
Basic Set              257
                      ... 
Ghostbusters             1
Planet Series 3          1
Star Wars Episode 8      1
Imperial Guards          1
Indiana Jones            1
Name: count, Length: 386, dtype: int64

In [13]:
# Per-theme summary: mean and standard deviation of the part count, plus the
# number of sets. Rows are counted with len instead of counting distinct set
# names because some identical sets are released again in a different year.
resume_theme = df_merged.groupby(['theme']).agg(
    mean_part=('num_parts', 'mean'),
    std=('num_parts', 'std'),
    set_nbr=('set', len),
)

# Keep the 15 themes with the most sets.
top15_theme = resume_theme.sort_values(by='set_nbr', ascending=False).iloc[:15]

Fig = px.bar(
    top15_theme,
    x=top15_theme['set_nbr'],
    title='TOP 15 of themes based on the number of sets',
    labels={
        'theme': 'Theme',
        'mean_part': 'Average number of parts',
        'set_nbr': 'Number of sets',
    },
    width=800,
    height=500,
)

Fig.show()

In [15]:
# The 15 themes with the highest mean part count per set.
top15_bigger_theme = resume_theme.sort_values(by='mean_part', ascending=False).head(15)

# One point per theme: mean part count on x, standard deviation as horizontal
# error bar, number of sets as text.
Fig = px.scatter(
    top15_bigger_theme,
    x=top15_bigger_theme['mean_part'],
    y=top15_bigger_theme.index,
    color=top15_bigger_theme.index,
    error_x='std',
    text='set_nbr',
    title='TOP 15 of biggest themes based on the average number of parts per set',
    labels={
        'theme': 'Theme',
        'mean_part': 'Average number of parts',
        'set_nbr': 'Number of sets',
    },
    width=800,
    height=500,
)

# Hide the legend and enlarge the markers before rendering.
Fig.update_layout(showlegend=False)
Fig.update_traces(marker={'size': 20})
Fig.show()

In [65]:
# Count the sets per (year, theme) pair and pivot the themes into columns:
# one row per year, one column per theme.
theme_released_year = (
    df_merged
    .groupby('year')['theme']
    .value_counts()
    .unstack()
)

# Bar chart with every theme column plotted against the year.
Fig = px.bar(
    theme_released_year,
    x=theme_released_year.index,
    y=theme_released_year.columns,
    title='Number of sets released by year by theme',
    labels={
        'year': 'Year',
        'theme': 'Theme',
        'value': 'Number of sets',
    },
)
Fig.show()

In [67]:
# Names of the ten themes with the largest total number of sets over all years.
top10_theme = (
    theme_released_year
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
    .index
    .to_numpy()
)

# Same bar chart as for all themes, restricted to those ten columns.
Fig = px.bar(
    theme_released_year,
    x=theme_released_year.index,
    y=top10_theme,
    title='Number of sets released by year for the 10 most popular themes',
    labels={
        'year': 'Year',
        'variable': 'Theme',
        'value': 'Number of sets',
    },
)
Fig.show()

In [35]:
# Recompute and display the names of the ten themes with the most sets overall.
top10_theme = (
    theme_released_year
    .sum()
    .sort_values(ascending=False)
    .iloc[:10]
    .index
    .to_numpy()
)
top10_theme

array(['Supplemental', 'Technic', 'City', 'Friends', 'Basic Set',
       'Creator', 'Gear', 'Service Packs', 'Duplo', 'Star Wars'],
      dtype=object)

In [32]:
 # Same top-ten selection, shown as a pandas Index instead of a NumPy array.
 theme_released_year.sum().sort_values(ascending=False).iloc[:10].index

Index(['Supplemental', 'Technic', 'City', 'Friends', 'Basic Set', 'Creator',
       'Gear', 'Service Packs', 'Duplo', 'Star Wars'],
      dtype='object', name='theme')

In [325]:
# Number of distinct set names within each theme.
theme_per_year = df_merged.groupby(by='theme')['set'].nunique()

theme_per_year

# Fig = px.bar(df_merged,
#             x = 'year',
#             y = 'theme')

# Fig.show()

theme
12V          50
4 Juniors     2
4.5V         77
9V           51
Advent        3
             ..
X-Men         3
X-Pod        19
Xalax        16
Znap         19
eLAB          1
Name: set, Length: 386, dtype: int64

In [None]:
# Load the remaining CSV tables of the LEGO database.
# sets.csv and themes.csv are not read again here: they are already loaded
# earlier in the notebook as df_sets and df_themes.
# The dataset directory is defined once so the input location can be changed
# in a single place instead of in every read_csv call.
LEGO_DATA_DIR = "/kaggle/input/lego-database"

df_color = pd.read_csv(f"{LEGO_DATA_DIR}/colors.csv")
df_inventory_sets = pd.read_csv(f"{LEGO_DATA_DIR}/inventory_sets.csv")
df_inventories = pd.read_csv(f"{LEGO_DATA_DIR}/inventories.csv")
df_inventory_parts = pd.read_csv(f"{LEGO_DATA_DIR}/inventory_parts.csv")
df_parts = pd.read_csv(f"{LEGO_DATA_DIR}/parts.csv")
df_part_categories = pd.read_csv(f"{LEGO_DATA_DIR}/part_categories.csv")

2. What colors are associated with which theme?