In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file under it.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import numpy as np
from plotly.offline import init_notebook_mode,iplot
import pandas as pd
%matplotlib inline

In [None]:
# Selected colors in HEX format; passed as the `colors` palette to the pie charts below.
mycolors = [
    '#B2B6FF',
    '#7AA4FA',
    '#FFA290',
    '#FFE2E7',
    '#F48AB4',
    '#8ED464',
    '#BDE03E',
    '#FD9B8E',
    '#FF6671',
    '#FD6148',
    '#A6E2FC',
    '#86DBD5',
    '#F78199',
    '#9A7ED2',
    # Was '#FS80B4': 'S' is not a hexadecimal digit, so the entry was not a valid color.
    # NOTE(review): '#F580B4' is the assumed intended value -- confirm.
    '#F580B4',
]

In [None]:
#Import Data -- thank data.world for data
# The CSV is read directly from a data.world query URL, so this cell needs network access.
df = pd.read_csv('https://query.data.world/s/hz6boyxvcbybd6jlnply3my4drwm5j')

# Bare expression: the notebook renders the DataFrame as this cell's output.
df

**This Cell is to Explain the details of all Columns :**

* Rank -- Game ranking based on the total sales (in millions)
* Name -- Name of the game
* Platform -- Game platform (Wii, NES, PS2, DS, GBA, etc.)
* Year -- Year of game release
* Genre -- The game genre (sports, racing, ...)
* Publisher -- Name of the publisher
* NA_Sales -- Sales in North America (in millions)
* EU_Sales -- Sales in Europe (in millions)
* JP_Sales -- Sales in Japan (in millions)
* Other_Sales -- Sales in other countries (in millions)
* Global_Sales -- Total sales worldwide (in millions)

In [None]:
# Summary statistics for the DataFrame, shown as the cell output.
df.describe()

From above Dataframe, we conclude that :

* 500 games are ranked based on their sales
* Games released between 1980 to 2020
* Mean/average sales in all regions are very low compared to the maximum.

In [None]:
print('display the shape of data (rows,columns)')
# Bare expression: the shape tuple is rendered as the cell output.
df.shape

In [None]:
# Sort by game name and show the first 5 rows of the sorted frame.
# The message previously announced 25 rows while head(5) only ever displayed 5.
print('Top 5 rows sorted by the "Name" column')
df.sort_values(by = ['Name']).head(5)

In [None]:
print('Top 5 rows from the dataset')
# First 5 rows in the file's original order.
df.head(5)

In [None]:
print('Bottom 5 rows from dataset')
# Last 5 rows in the file's original order.
df.tail(5)

## To Print all Columns names

In [None]:
print('To print all col names')
# Emit one column label per line.
for column_name in df.columns.tolist():
    print(column_name)

In [None]:
# Distinct values of the Name, Genre and Publisher columns; their lengths are printed in the next cell.
# NOTE(review): unique() keeps NaN as a distinct value, so a missing Publisher would add one to its count -- confirm.
x = df['Name'].unique() #using numpy.ndarray to find all Names but only UNIQUE.
y = df['Genre'].unique()
z = df['Publisher'].unique()

In [None]:
# x, y and z are the arrays of distinct names, genres and publishers built in the previous cell.
print('Total Games by `Name` count(unique) :',len(x))
print('Total Games by `Genre` count(unique) :',len(y))
print('Total Games by `Publisher` count(unique) :',len(z))

# Exploratory Analysis and Visualization

Use the Matplotlib and Seaborn libraries to visualize the data.

In [None]:
# Keep only the first 25 rows; several plots below work on this small sample.
vg_plot = df.head(25)
vg_plot

In [None]:
# Line plot of release year against rank for the 25-row sample in vg_plot.
x = vg_plot['Rank']
y = vg_plot['Year']
plt.figure(figsize=(25,8), dpi=80)
plt.plot(x,y, label = 'Year', color = 'blue')
plt.xlabel('Rank')
plt.ylabel('Year')
# The title previously said "Global Sales by Rank", but the series plotted is Year.
plt.title('Release Year by Rank For 25 Rows')
plt.legend()
plt.show()


In [None]:
# Smooth (kernel density) estimate of the Global_Sales distribution.
# Draw Plot
plt.figure(figsize=(25,8), dpi= 80)
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(df.Global_Sales, fill=True, label = 'Global Sales', color="g", alpha=.7)

# Decoration
plt.title('Overall Global Sales Distribution', fontsize=16)
plt.legend()
plt.show()

In [None]:
# Per-region sales columns of the 25-row sample (vg_plot); reused by the comparison plots in the following cells.
total = vg_plot['Global_Sales']
NA = vg_plot['NA_Sales']
EU = vg_plot['EU_Sales']
JP = vg_plot['JP_Sales']
OTHER = vg_plot['Other_Sales']

In [None]:
# One line per region, plotted against the row index of the 25-row sample.
plt.figure(figsize=(25,8), dpi= 80)
plt.grid(True)
plt.title('Comparison of Regional Sales with Global Sales')

plt.plot(total, label = 'Global')
plt.plot(NA, label = 'AMERICA')
plt.plot(EU, label = 'EUROPE')
plt.plot(JP, label = 'JAPAN')
plt.plot(OTHER, label = 'OTHER')
# Axis labels so the figure can be read on its own.
plt.xlabel('Row index')
plt.ylabel('Sales (in millions)')
plt.legend(bbox_to_anchor =(1.0, 1.025), ncol = 2)
plt.show()

In [None]:
# Overlay one semi-transparent histogram per region for the 25-row sample.
plt.figure(figsize=(25,8))
kwargs = dict(histtype='barstacked', alpha=0.3, bins=40)
for sales, region in [(total, 'Global'),
                      (NA, 'North America'),
                      (EU, 'Europe'),
                      (JP, 'Japan'),
                      (OTHER, 'Other')]:
    # A label per series lets the legend tell the overlaid histograms apart.
    plt.hist(sales, label=region, **kwargs)
# The x-axis carries sales values for every region and the y-axis is a count of
# games per bin (it was previously labelled 'Countries').
plt.xlabel('Sales (in millions)')
plt.ylabel('Number of games')
plt.title('Comparison of Global Sales with all Regions')
plt.legend()
plt.show()

In [None]:
# 2-D histogram of release year against global sales for the 25-row sample.
plt.figure(figsize=(10,7))
x = vg_plot['Year']
y = vg_plot['Global_Sales']
plt.title('Global sales occur (in Millions)')
# 'mako_r' is not a built-in matplotlib colormap name.
# NOTE(review): presumably it is available because seaborn registers it on import -- confirm.
plt.hist2d(x, y, bins=22, cmap='mako_r')
# Colorbar showing how many rows fall into each (year, sales) bin.
cb = plt.colorbar()
cb.set_label('counts in bin')

## Maximum games sold using Countplot method

In [None]:
# countplot counts rows per category, so each bar is the number of games listed for that year.
plt.figure(figsize=(25,10))
# Pass the column by keyword: newer seaborn releases no longer accept it positionally.
sns.countplot(x='Year', data=df)
# The bars are row counts, not sales figures, so the title says "released" rather than "sold".
plt.title('Number of Games Released per Year')
plt.show()

## Top 10 Platforms, Genres, Publishers with Histogram plotting

In [None]:
# Each value_counts() is computed once and reused for both the labels and the counts.
# value_counts() sorts descending, so head(10) keeps the ten most frequent values.

#top platforms (name of the platform,total number of games developed for that platform)
top_platform_counts = df.Platform.value_counts().head(10)
topPlatforms_index = top_platform_counts.index
topPlatforms_values = top_platform_counts.values

#top genres (name of the genre,total number of games developed in that genre)
top_genre_counts = df.Genre.value_counts().head(10)
topGenres_index = top_genre_counts.index
topGenres_values = top_genre_counts.values

#top game developers/publishers (name of the publisher,total number of games published by that publisher)
top_publisher_counts = df.Publisher.value_counts().head(10)
topPublisher_index = top_publisher_counts.index
topPublisher_values = top_publisher_counts.values

fig, (ax1,ax2) = plt.subplots(1,2,figsize=(25,8), facecolor='white')

# Thick vertical lines from 0 up to each count are used as bars.
##top platforms used for games
ax1.vlines(x=topPlatforms_index, ymin=0, ymax=topPlatforms_values, color='#FF4D6D', linewidth=30)
ax1.set_title('Top 10 Platforms',fontsize=16)

#top genres of Games accordingly
ax2.vlines(x=topGenres_index, ymin=0, ymax=topGenres_values, color='#A609EF', linewidth=30)
ax2.set_title('Top 10 Genres',fontsize=16)
plt.show()

fig, ax = plt.subplots(figsize=(25,8), facecolor='white')

#top publishers of the games
ax.vlines(x=topPublisher_index, ymin=0, ymax=topPublisher_values, linewidth=65, color='#168AAD')
ax.set_title('Top 10 Publishers',fontsize=16)
# Finish the second figure the same way as the first.
plt.show()

**Conclusions from the bar graphs above:**

* DS and PS2 are the most popular platforms in comparison to others platform.
* Action is the most popular genre and the second most is the sports
* Electronic Arts has published 1300+ products

## Correlating game sales among regions and globally with Seaborn

Visualizing the multidimensional relationships among the samples is as easy as calling sns.pairplot:

In [None]:
# Pairwise relationships between the regional and global sales columns, drawn with seaborn's pairplot.
sales_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
sales_only = df.loc[0:, sales_columns]
sns.pairplot(sales_only)
plt.show()

**Conclusions from the correlation among the regions:**
* North America is the major market, as global sales are highly correlated with it.
* Europe is also an important region.
* One interesting thing is that Japanese sales are not correlated with any other region's sales. We can assume that Japanese players have different tastes when it comes to games.

## TOP 15 GAMES IN OTHER USING BAR CHART (HORIZONTALLY)

In [None]:
# Order the games by Other_Sales, largest first, and keep the leading 15 rows.
df_sort_of_Other_Sales = df.sort_values('Other_Sales', ascending=False)
top15 = df_sort_of_Other_Sales.head(15)
top15

In [None]:
# Horizontal bar chart of the 15 games with the highest Other_Sales.
plt.figure(figsize = (18,8))
plt.barh(top15["Name"],top15["Other_Sales"], label = 'Top Games')
plt.title("Top 15 games sold in Other",fontdict = {"fontsize":20})
plt.legend()
# Save only after the legend has been added; saving first left the legend out of the image file.
plt.savefig("Top 15 games sold in Other.jpg",dpi = 300) #And to save it as an Jpeg image in the Directory
plt.show()

In [None]:
Publisher = list(df.Publisher.unique())

# Total global sales per publisher, largest first.
# The earlier loop kept publishers in order of first appearance, so slicing the
# first ten entries did not give the ten best-selling publishers.
global_sale_of_every_Publisher = (
    df.groupby('Publisher')['Global_Sales']
    .sum()
    .sort_values(ascending=False)
)

# The ten publishers with the highest total global sales.
top_10 = global_sale_of_every_Publisher.head(10)

In [None]:
# Pie chart of the publisher totals held in top_10, colored from the mycolors palette.
plt.figure(figsize=(10.5, 9))
pie_options = dict(
    labels=top_10.index,
    autopct="%.2f%%",
    textprops={"fontsize": 13},
    labeldistance=1.05,
    colors=mycolors,
)
plt.pie(top_10, **pie_options)
plt.legend(loc=4, fontsize=12, bbox_to_anchor=(1.75, 0.82), ncol=2)
plt.title("Top 10 Publisher of Games", fontdict={"fontsize": 25, "fontweight": 100})
# Write the figure to disk before showing it.
plt.savefig("Top 10 Publisher of Games", dpi=100)
plt.show()

## Percentage of Each Genre of Games 

In [None]:
# Number of rows per genre.
Genre = df.Genre.value_counts()

In [None]:
# Pie chart of the per-genre counts, one wedge per genre.
plt.figure(figsize=(8, 8))
labels = Genre.index
plt.pie(
    Genre,
    labels=labels,
    autopct="%.2f%%",
    colors=mycolors,
)
plt.title("Percentage of Top Genres of Games", fontdict={"fontsize": 17})
plt.savefig("Top Genres Chart", dpi=100)
plt.show()

## Best Selling Games in  Countries

In [None]:
#Pie Plot

def plot_top_games_pie(sales_col, title, top_n=5):
    """Draw a pie chart of the games with the highest total in ``sales_col``.

    Sales are summed per game name, so a title that appears on several rows
    is counted once with its combined sales.

    Parameters
    ----------
    sales_col : str
        Name of the sales column to rank by (e.g. ``'NA_Sales'``).
    title : str
        Title placed on the chart.
    top_n : int, default 5
        Number of games shown.

    Returns
    -------
    pandas.DataFrame
        Single-column frame indexed by game name, sorted ascending, holding
        the ``top_n`` largest totals.
    """
    top_games = (
        df.groupby('Name')[sales_col]
        .sum()
        .sort_values()
        .tail(top_n)
        .to_frame()
    )
    top_games.plot.pie(y=sales_col, autopct='%1.1f%%', figsize=(6, 6))
    plt.title(title)
    return top_games


# One chart per region. The fourth column is Other_Sales (all remaining
# countries); it was previously commented as "India".
for sales_col, title in [
    ('NA_Sales', "Best selling games in North America"),
    ('EU_Sales', "Best selling games in Europe"),
    ('JP_Sales', "Best selling games in Japan"),
    ('Other_Sales', "Best selling games in Other Countries"),
]:
    df1 = plot_top_games_pie(sales_col, title)

## Video Game Sale Based on Genre (Global vs. OTHER)

In [None]:
df_genre = df.groupby('Genre')
def genreBased(region):
    """Plot total sales per genre for one sales column as a donut and a bar chart.

    The left panel is a donut chart of each genre's share; the right panel is
    a horizontal bar chart of the same totals with the value written next to
    each bar.

    Parameters
    ----------
    region : str
        Name of the sales column to aggregate (e.g. ``'Global_Sales'``).

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure.
    """
    # Sum only the requested column, once. Summing the whole grouped frame also
    # aggregated the text columns and was repeated three times.
    totals = df_genre[region].sum().sort_values(ascending=False)[::-1]

    fig, ax = plt.subplots(ncols=2, figsize=(18, 6))
    donut_ax, bar_ax = ax

    # Target the right-hand axes explicitly instead of relying on it being the
    # current axes.
    totals.plot(kind='barh', ax=bar_ax)
    bar_ax.set_title(region)
    #labels
    bar_ax.set_ylabel('')
    bar_ax.tick_params(axis='both', which='major', labelsize=13)
    bar_ax.set_xlabel('Total Sales(in millions)', fontsize=15, labelpad=21)
    #spines
    bar_ax.spines['top'].set_visible(False)
    bar_ax.spines['right'].set_visible(False)
    bar_ax.grid(False)

    #annotations: bar number `position` ends at x == value, so the text goes just right of it
    for position, value in enumerate(totals):
        bar_ax.annotate(str(round(value, 2)),      # this is the text
                        (value, position),          # this is the point to label
                        textcoords="offset points", # how to position the text
                        xytext=(6, 0),              # distance from text to points (x,y)
                        ha='left', va="center")

    #donut chart: one shade of the 'Blues' colormap per genre
    theme = plt.get_cmap('Blues')
    donut_ax.set_prop_cycle("color", [theme(1. * i / len(totals)) for i in range(len(totals))])
    donut_ax.pie(totals, wedgeprops=dict(width=0.45), startangle=-45, labels=totals.index,
                 autopct="%.1f%%", textprops={'fontsize': 13,})

    plt.tight_layout()

In [None]:
# Each call draws its own figure, so the Global_Sales figure is rendered first and the Other_Sales figure after it.
genreBased('Global_Sales') #ABOVE
genreBased('Other_Sales') #BELOW

## Video Game sale from 1985 to 2017 (Year Basis)

In [None]:
# Total sales per release year. Only the sales columns are summed; summing the
# whole frame would also try to aggregate the text columns.
sales_columns = ['Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
df_year = df.groupby('Year')[sales_columns].sum().sort_index(ascending=False)

fig,cc = plt.subplots(figsize=(18,6))
for column, region_label in zip(sales_columns,
                                ['Global', 'North America', 'Europe', 'Japan', 'Other']):
    cc.plot(df_year.index, df_year[column], label=region_label, linewidth=2)

cc.legend(loc="center left")

cc.set_ylabel('Total Sales(in millions)', fontsize=15,labelpad=45)
# np.arange excludes its stop value, so stop at 2018 to get a tick for 2017 as well.
cc.set_xticks(np.arange(1985,2018,1))

cc.tick_params(axis='both', which='major', labelsize=13)
cc.grid(False)

# Tilt the year labels so neighbouring ticks do not overlap.
for item in cc.get_xticklabels():
    item.set_rotation(25)

#### Observations Based on above linear Graph (with subplots)
* As seen in the graph above video-game sales peaked in <b>2008-2010</b> across the globe.
* But the overall sales of <b>Japan</b> have remained consistent nonetheless.

## Displaying the trend of Analysis using Seaborn's Scatterplot and Distplot method

In [None]:
#Scatterplot Method
plt.figure(figsize=(20,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.scatterplot(x=df.Other_Sales[200:450], # X-axis taken in range from 200 to 450
                y=df.Global_Sales[200:450],  # Y-axis taken in range from 200 to 450
                hue=df.Rank,  # Dot color
                s=75)
plt.title("Scatterplot view of Global vs. Other Sales", fontdict={'fontsize':26})

#Xlabel and Ylabel are in millions

### Histograms and KDE can be combined using distplot

In [None]:
#Displot Method

df_name = ['Global','North America', 'Europe', 'Japan','Other']
df_lst = [df.Global_Sales,
          df.NA_Sales,
          df.EU_Sales,
          df.JP_Sales,
          df.Other_Sales]

# Pair each series with its display name instead of keeping a manual counter.
for region_name, sales in zip(df_name, df_lst):
    plt.figure(figsize=(20,8), dpi= 80)
    # histplot with a KDE overlay on a density scale is the replacement for the
    # deprecated sns.distplot; cut=3 keeps the KDE extent of the old default.
    sns.histplot(sales, kde=True, stat='density', kde_kws=dict(cut=3))
    plt.title('Displot Method for '+str(region_name)+' Sales', fontdict={'fontsize':26})

## Heatmap view of Games

In [None]:
# Heatmap of every *Sales column for the first 50 rows of the dataset.
plt.figure(figsize=(20,30), dpi= 50)
# An unused `df.groupby('Publisher').mean()` was removed here: its result was
# never read, and averaging the text columns fails on recent pandas.
gamesample = df.filter(like = 'Sales')[0:50]
plt.title("Stats")
sns.heatmap(gamesample, annot=True, cmap='mako_r')
# NOTE(review): these limits make the y-axis run upwards from 0, which looks like a
# workaround for clipped first/last rows -- confirm it is still wanted.
plt.ylim(0,51)
# Each heatmap row is one game (a row of the DataFrame), not a publisher count.
plt.ylabel('Game (row index)')

In [None]:
# Pearson correlation between all numeric columns, drawn as an annotated heatmap.

plt.figure(figsize=(20,12), dpi= 50)
# Restrict to numeric columns first: recent pandas raises when corr() meets
# text columns such as Name or Publisher instead of silently dropping them.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr('pearson'),annot=True, cmap = 'mako_r')

## Swarmplot with Sample color palette 

In [None]:
# Swarm plot of release year per publisher, limited to the first 1000 rows.
plt.figure(figsize=(15, 15), dpi=70)
first_thousand = df.iloc[:1000]
sns.swarmplot(data=first_thousand, x='Year', y='Publisher')
plt.title('Swarmplotting implementation for Year vs. Publisher', fontdict={"fontsize": 17})

## Trend of Publisher with maximum Games sold using Seaborn's Swarmplot

In [None]:
# Global sales of the first 50 games, colored by publisher.
plt.figure(figsize=(17,9), dpi= 70)
sns.swarmplot(x='Global_Sales',
              y='Name',
              # Rows 0..49: the earlier slice df[1:50] started at position 1 and so
              # dropped the first row of the dataset.
              data=df.iloc[:50],
              hue='Publisher')
plt.title('Trend of Publisher that how much games have been sold')

# <center>Asking and Answering Questions</center>

### Q1.  How many games have been sold between 2005 and 2015 in Millions of all Countries and Globally ?

In [None]:
print('''Calculate total amount in millions, ranges from 2005 to 2015 and round it to Integer using round() function.''')
# range() excludes its stop value, so the stop must be 2016 for 2015 to be
# included, as every message below states "2005 to 2015".
year_count = list(range(2005, 2016))
count_in_range = df.loc[df['Year'].isin(year_count)]

ns = count_in_range.NA_Sales.sum() #Total sales in North America
print('\nTotal Expenditure in North America from 2005 to 2015 is',round(ns),'Millions')

es = count_in_range.EU_Sales.sum() #Total sales in Europe
print('Total Expenditure in Europe from 2005 to 2015 is',round(es),'Millions')

js = count_in_range.JP_Sales.sum() #Total sales in Japan
print('Total Expenditure in Japan from 2005 to 2015 is',round(js),'Millions')

ins = count_in_range.Other_Sales.sum() #Total sales in all other countries
print('Total Expenditure in Other Countries from 2005 to 2015 is',round(ins),'Millions')

gs = count_in_range.Global_Sales.sum() #Global Sales
print('\nTotal Expenditure from Globally from 2005 to 2015 is',round(gs),'Millions')

### Q2. How to Plot using pointplot between the range of Year 2005 and 2015 ?

In [None]:
# Keep the rows whose Year is strictly after 2004 and strictly before 2016.
first_filter = df['Year'].gt(2004)
second_filter = df['Year'].lt(2016)
newdata = df.loc[first_filter & second_filter]

# Point plot of NA_Sales per year, one colored series per platform.
catplot_options = dict(
    kind="point",
    hue="Platform",
    palette='Set1',
    ci=None,
    edgecolor=None,
    height=8.27,
    aspect=11.7/8.27,
)
sns.catplot(x="Year", y="NA_Sales", data=newdata, **catplot_options)
plt.show()

### Q3. Create DataFrames of the Platforms and Publishers with the highest game counts, sorted accordingly.

In [None]:
# value_counts() sorts descending, so head(15) keeps the 15 most frequent platforms / publishers.
top_platform = df.Platform.value_counts().head(15)
top_publisher = df.Publisher.value_counts().head(15)

#Top Platforms are Under 15
# to_frame() turns the counts Series into a one-column DataFrame for display.
top_platform.to_frame()

In [None]:
#Top Publishers are under 15
# Displays the publisher counts computed in the previous cell as a one-column DataFrame.
top_publisher.to_frame()

### Q4. How to create a new dataframe with column name 'Name Of Game Which are Unique' and display all Unique Games and in which index starts from 1  ?

In [None]:
# Distinct game names, placed in a single-column frame whose index counts from 1.
uni_array = df.Name.unique()
ind = list(range(1, len(uni_array) + 1))

ddff = pd.DataFrame(uni_array, index=ind, columns=['Name Of Game Which are Unique'])
ddff

### Q5. How to aggregate the mean, minimum, maximum of games based on Publisher For Global Sales Column ?

In [None]:
# Mean, minimum and maximum of Global_Sales for each publisher.
# The dict form of agg() yields two-level columns: ('Global_Sales', 'mean'), ('Global_Sales', 'min'), ...
aggr_result = df.groupby('Publisher').agg({'Global_Sales': ['mean', 'min', 'max']})
aggr_result

### Q6. How many games have been sold between 2005 and 2015, in millions, across all platforms?

In [None]:
#Scatterplot Method
plt.figure(figsize=(20,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.scatterplot(x=df.Genre[200:450], # X-axis taken in range from 200 to 450
                y=df.Publisher[200:450],  # Y-axis taken in range from 200 to 450
                hue=df.Rank,  # Dot color
                s=75)
# The y-axis shows Publisher, so the title names Publisher rather than Platform.
plt.title("Scatterplot view of Genre and Publisher", fontdict={'fontsize':26})

#Xlabel and Ylabel are in millions

Thank you for visiting my project.

Future work includes:
* Implementing machine learning models.
* Predicting the number of users with a regression model.
* Exploring the correlation between various trends, such as the best sellers by Name, Genre and Publisher, and on a particular date as well.