In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the first five Olist tables; each CSV is read into its own DataFrame,
# assigned in the same order as the paths below.
customers, sellers, order_reviews, order_items, products = map(
    pd.read_csv,
    (
        '/kaggle/input/brazilian-ecommerce/olist_customers_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/olist_sellers_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/olist_order_reviews_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/olist_order_items_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/olist_products_dataset.csv',
    ),
)

In [5]:
# Load the remaining four tables; names on the left match the path order below.
geolocation, product_category, orders, payments = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/brazilian-ecommerce/olist_geolocation_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/product_category_name_translation.csv',
        '/kaggle/input/brazilian-ecommerce/olist_orders_dataset.csv',
        '/kaggle/input/brazilian-ecommerce/olist_order_payments_dataset.csv',
    )
]

In [6]:
# Names of every DataFrame loaded above, kept for future reference.
# Note: this holds the variable names as strings, not the DataFrames themselves.
list_of_all_datasets = [
    'customers',
    'sellers',
    'order_reviews',
    'order_items',
    'products',
    'geolocation',
    'product_category',
    'orders',
    'payments',
]

### Analysis of Customers Data

In [7]:
# Preview the first five rows of the customers table (head() defaults to n=5).
customers.head()

In [8]:
# Dimensions of the customers table as (rows, columns).
customers.shape

In [9]:
# Count missing values per column (isna is the canonical name for isnull).
customers.isna().sum()

#### We can see that there are no null values in the Customers dataset.
#### Next, we will find the number of unique states and cities in the Customers dataset.

In [10]:
# Distinct city names appearing in the customers table.
customers['customer_city'].unique()

In [11]:
# Number of distinct cities; dropna=False keeps the count identical to len(unique()).
customers['customer_city'].nunique(dropna=False)

#### There are a total of 4119 different cities that customers come from.

In [12]:
# Distinct state codes appearing in the customers table.
customers['customer_state'].unique()

In [14]:
# Show the state column itself (one entry per customer row).
customers['customer_state']

In [15]:
# Number of distinct states; dropna=False keeps the count identical to len(unique()).
customers['customer_state'].nunique(dropna=False)

In [16]:
# Total number of entries in the state column (one per customer row).
customers['customer_state'].size

### Visualisation of Customer Dataset

In [17]:
# Histogram of customers per state, drawn on an explicitly created 13x7 figure.
fig, ax = plt.subplots(figsize=(13, 7))
sns.histplot(data=customers, x='customer_state', color='blue', ax=ax)
plt.show()

#### From the histogram above, we can see that most customers are from SP (São Paulo) state, followed by RJ (Rio de Janeiro) state.

In [18]:
# The ten cities with the most customer rows, as a Series of counts indexed by city.
top_10_cities = (
    customers['customer_city']
    .value_counts()
    .nlargest(10)
)
top_10_cities

In [19]:
# City frequencies ordered from fewest to most customers; show the first 20.
cities = customers['customer_city'].value_counts(ascending=True)
cities.head(20)

#### We can see that many cities have a count of 1; these are the cities with the fewest customers.

In [20]:
# Customers per city as a two-column frame: customer_city and customer_id (the count).
# Selecting the column before counting avoids counting every other column as well.
city_df = (
    customers
    .groupby('customer_city')['customer_id']
    .count()
    .reset_index()
)

In [21]:
# Horizontal bar chart of the ten cities with the most customers.
# nlargest already returns rows ordered by descending count, so no prior sort is needed.
sns.barplot(
    data=city_df.nlargest(10, 'customer_id'),
    x='customer_id',
    y='customer_city',
)
plt.title('Cities with the Most Customers')
# The x axis carries the counts and the y axis the city names, so label them accordingly.
plt.xlabel('Number of Customers')
plt.ylabel('City')
plt.show()

##### Again, the city with the most customers is Sao Paulo, followed by Rio de Janeiro.

### Analysis of products and items

In [22]:
# Preview the first rows of the products table.
products.head()

In [23]:
# Preview the first rows of the order items table.
order_items.head()

#### From the two datasets above, we can see that they can be inner-joined on the product_id column. Let's see.

In [24]:
# Attach product attributes to every order item.
# The join key and join type are spelled out instead of relying on pd.merge's
# default of joining on whatever column names the two frames happen to share.
products_and_order_items_df = pd.merge(
    order_items,
    products,
    on='product_id',
    how='inner',
)

In [25]:
# Preview the merged order-item / product table.
products_and_order_items_df.head()

In [26]:
# Ten most frequently ordered product categories.
# Pick the top ten on the counts Series itself, then name both columns explicitly.
# Calling reset_index() first and ranking on 'product_category_name' only works on
# pandas < 2.0; on pandas >= 2.0 that column holds the category labels (strings),
# so nlargest fails.
top_10_products = (
    products_and_order_items_df['product_category_name']
    .value_counts()
    .nlargest(10)
    .rename_axis('product_category_name')
    .reset_index(name='count')
)

In [29]:
# Ten least frequently ordered product categories.
# Pick the bottom ten on the counts Series itself, then name both columns explicitly.
# Calling reset_index() first and ranking on 'product_category_name' only works on
# pandas < 2.0; on pandas >= 2.0 that column holds the category labels (strings),
# so nsmallest fails.
lowest_10_products = (
    products_and_order_items_df['product_category_name']
    .value_counts()
    .nsmallest(10)
    .rename_axis('product_category_name')
    .reset_index(name='count')
)

In [27]:
# Display the most frequently ordered product categories.
top_10_products

In [30]:
# Display the least frequently ordered product categories.
lowest_10_products

### Payments Analysis

In [31]:
# Preview the first rows of the payments table.
payments.head()

In [32]:
# Distinct payment methods recorded in the payments table.
payments['payment_type'].unique()

#### There are basically 5 types of payment methods used by the customers

In [33]:
# Number of payments per payment method; value_counts orders by descending count by default.
top_payment_type = payments['payment_type'].value_counts()

In [34]:
# Display the payment counts per payment method.
top_payment_type

#### Out of 5 types of methods, credit card is used on the top, then boleto and then voucher

In [35]:
# Check what kind of object value_counts() returned.
type(top_payment_type)

#### Since this is a Series object, we can draw a bar plot using its index and values

In [36]:
# Payment-method labels (the index of the counts Series); used as the bar categories below.
a = top_payment_type.index

In [37]:
# Payment counts as a plain array, aligned position-by-position with the labels in `a`.
b = top_payment_type.values

In [38]:
# Bar chart of how many payments were made with each payment method.
# Read the labels and counts straight from top_payment_type so the cell does not
# depend on the single-letter helpers, and label the figure so it stands alone.
sns.barplot(x=top_payment_type.index, y=top_payment_type.values)
plt.title('Payments by Payment Type')
plt.xlabel('Payment Type')
plt.ylabel('Number of Payments')
plt.show()

##### From the bar graph above, we can see that credit card usage is the highest at around 75000, followed by boleto at slightly less than 20000.

### Products Reviews

In [39]:
# First, check whether products_and_order_items_df and order_reviews share a column we can join on.
products_and_order_items_df.head(3)

In [40]:
# Show the first 3 rows of order_reviews.
order_reviews.head(3)

#### We can join them on the order_id column, which is common to both; for this we will make another dataframe named "reviews_df".

In [41]:
# Attach review data to every order item / product row.
# The join key and join type are spelled out instead of relying on pd.merge's
# default of joining on whatever column names the two frames happen to share.
# NOTE(review): an order with several items repeats its review once per item,
# so per-category averages computed from this frame are item-weighted; confirm
# that is the intended weighting.
reviews_df = pd.merge(
    products_and_order_items_df,
    order_reviews,
    on='order_id',
    how='inner',
)

In [42]:
# Preview the first 3 rows of the merged reviews table.
reviews_df.head(3)

In [43]:
# Dimensions of the merged reviews table as (rows, columns).
reviews_df.shape

In [44]:
# Number of reviews for each review score.
order_reviews['review_score'].value_counts()

In [45]:
# Distinct review score values present in the data.
order_reviews['review_score'].unique()

#### Rating points are discrete, ranging from 1 to 5.

In [46]:
# Bar chart of how many reviews gave each score; the counts are computed once and reused.
review_score_counts = order_reviews['review_score'].value_counts()
sns.barplot(x=review_score_counts.index, y=review_score_counts.values)
plt.xlabel('Ratings')
plt.ylabel('Counts')

Most of the products have been rated 5, then 4.
Also, a rating of 1 is more common than ratings of 2 and 3.


### Top Ten rated products

In [47]:
# Preview reviews_df again before aggregating scores per product category.
reviews_df.head(3)

In [48]:
# Average review score per product category, as a two-column frame
# (product_category_name, review_score).
# Select review_score before aggregating: calling .mean() on the whole grouped
# frame tries to average the text columns too, which raises TypeError on
# pandas >= 2.0 and wastes work on older versions.
product_reviews_mean = (
    reviews_df
    .groupby('product_category_name')['review_score']
    .mean()
    .reset_index()
)

In [49]:
# Preview the per-category average review scores.
product_reviews_mean.head(3)

In [50]:
# Ten best- and ten worst-rated product categories by average review score.
# nlargest / nsmallest already return their rows ordered by review_score, so the
# extra sort_values pass beforehand did nothing useful and has been dropped.
top_10_ratings = product_reviews_mean.nlargest(10, 'review_score')
lowest_10_ratings = product_reviews_mean.nsmallest(10, 'review_score')

In [51]:
# Horizontal bar chart of the ten best-rated product categories.
ax = sns.barplot(data=top_10_ratings, x='review_score', y='product_category_name')
ax.set_title('Top 10 Product Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Product Category Name')

#### The music, DVD, and CD category has the highest average rating; after that comes infants' fashion clothing.

In [52]:
# Horizontal bar chart of the ten worst-rated product categories.
ax = sns.barplot(data=lowest_10_ratings, x='review_score', y='product_category_name')
ax.set_title('Lowest 10 Product Ratings')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Product Category Name')

#### Insurance Services have the worst ratings, followed by fraldas higiene products

In [53]:
# Print the list of all dataset names again to see which ones are left to explore.
list_of_all_datasets

In [54]:
# Preview the first rows of the geolocation table.
geolocation.head()

In [55]:
# Count missing values per column (isna is the canonical name for isnull).
geolocation.isna().sum()

In [56]:
# Number of distinct city names; dropna=False keeps the count identical to len(unique()).
geolocation['geolocation_city'].nunique(dropna=False)

#### There are 8011 unique cities in the geolocation data.