##### 1. Importing Pandas, the library we will be using in this tutorial

In [2]:
import pandas as pd


##### 2. Reading data from different file formats

* From csv

In [3]:
# Load the CSV file into a DataFrame; the path is relative to the current working directory
df = pd.read_csv('./online_shoppers.csv')

In [5]:
# Use head() to preview the first 3 rows and confirm the data loaded as expected
df.head(3)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,6,171,0.0,0.02,0.0,0.0,Dec,2,2,3,2,Returning_Visitor,False,False
1,11,232.009453,6,274.534908,206,8064,0.0085,0.019,3.4,0.0,Feb,1,2,1,3,Returning_Visitor,False,True
2,0,0.0,0,0.0,27,1029,0.0,0.009,0.0,0.0,Nov,2,2,7,2,Returning_Visitor,False,False


* From other file types

|File type              |Pandas Function|
|--------------         |:-------------:|
|excel                  |pd.read_excel()|
|hdf                    |pd.read_hdf()  |
|json                   |pd.read_json() |
|sql                    |pd.read_sql()  |
|...                    |               |



##### 3. Quick data exploration

* Data types
<div class='alert alert-success'>
<p>Used to quickly check the types of data in the dataframe</p>
</div>

In [6]:
# dtypes is an attribute (no parentheses) listing the dtype of every column;
# here the text columns Month and VisitorType are reported as object
df.dtypes

Administrative               int64
Administrative_Duration    float64
Informational                int64
Informational_Duration     float64
ProductRelated               int64
ProductRelated_Duration      int64
BounceRates                float64
ExitRates                  float64
PageValues                 float64
SpecialDay                 float64
Month                       object
OperatingSystems             int64
Browser                      int64
Region                       int64
TrafficType                  int64
VisitorType                 object
Weekend                       bool
Revenue                       bool
dtype: object

* Describe
<div class='alert alert-success'>
<p>Used to quickly get the statistics of the dataframe</p>
</div>

In [7]:
# With no arguments, describe() summarizes the numeric columns only
# (Month, VisitorType, Weekend and Revenue are absent from the output)
df.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,3.324412,103.087362,0.587916,45.201377,34.344931,1361.09927,0.024095,0.04517,5.623058,0.069002,2.162855,2.416869,3.079481,4.05953
std,5.338629,215.123152,1.425501,166.154196,50.354704,2307.16313,0.0505,0.050037,17.368645,0.208337,0.916189,1.759884,2.439205,4.103281
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,189.0,0.0,0.015,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,15.714109,0.0,0.0,18.0,656.5,0.0044,0.027,0.0,0.0,2.0,2.0,2.0,2.0
75%,4.0,118.881631,0.0,0.0,40.0,1571.0,0.0191,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,24.0,2152.047318,12.0,2037.449693,500.0,23837.0,0.2,0.2,258.2,1.0,8.0,13.0,9.0,20.0


In [8]:
# We can pass include='all' to get statistics for the non-numerical columns too
# (these get the unique, top and freq rows).
df.describe(include='all')

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330,12330.0,12330.0,12330.0,12330.0,12330,12330,12330
unique,,,,,,,,,,,10,,,,,3,2,2
top,,,,,,,,,,,May,,,,,Returning_Visitor,False,False
freq,,,,,,,,,,,3325,,,,,10825,9595,10428
mean,3.324412,103.087362,0.587916,45.201377,34.344931,1361.09927,0.024095,0.04517,5.623058,0.069002,,2.162855,2.416869,3.079481,4.05953,,,
std,5.338629,215.123152,1.425501,166.154196,50.354704,2307.16313,0.0505,0.050037,17.368645,0.208337,,0.916189,1.759884,2.439205,4.103281,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,1.0,1.0,,,
25%,0.0,0.0,0.0,0.0,7.0,189.0,0.0,0.015,0.0,0.0,,2.0,2.0,1.0,2.0,,,
50%,1.0,15.714109,0.0,0.0,18.0,656.5,0.0044,0.027,0.0,0.0,,2.0,2.0,2.0,2.0,,,
75%,4.0,118.881631,0.0,0.0,40.0,1571.0,0.0191,0.05,0.0,0.0,,3.0,2.0,4.0,4.0,,,


* Info
<div class='alert alert-success'>
<p>Used to quickly get a summary of the dataframe</p>
</div>

In [9]:
# info() prints the index range, the column count, and each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  int64  
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           