# Loading Data from Various Sources

In [1]:
import pandas as pd

In [3]:
# Load the retail supermarket dataset from its CSV file (path is relative to the working directory).
# NOTE(review): 'Postal Code' is inferred as int64 and its recorded minimum is 1040, so any
# leading zeros present in the file would be dropped; consider dtype={'Postal Code': str}
# if the column is meant to be a code rather than a number. TODO confirm against the file.
df_csv = pd.read_csv(filepath_or_buffer='retail supermarket.csv')

In [5]:
# Load the Excel copy of the dataset (its recorded head and shape match the CSV frame).
# sheet_name=0 is the pandas default (first worksheet); it is spelled out so the choice is visible.
df_excel = pd.read_excel(io='retail supermarket.xlsx', sheet_name=0)

In [7]:
# Load sample4.json into a DataFrame.
# NOTE(review): the recorded output shows a single 'people' column whose cells are dicts
# (shape (3, 1)), so the per-person fields are not columns yet; pd.json_normalize on those
# records would expose them if field-level exploration is wanted.
df_json = pd.read_json(path_or_buf='sample4.json')

# Exploring the Structure of a DataFrame

In [11]:
# Preview the first five rows of the CSV data.
# The frame is left as the cell's last expression so Jupyter renders it as a table,
# instead of the line-wrapped plain text that print() produces for wide frames.
df_csv.head()

        Ship Mode    Segment        Country             City       State  \
0    Second Class   Consumer  United States        Henderson    Kentucky   
1    Second Class   Consumer  United States        Henderson    Kentucky   
2    Second Class  Corporate  United States      Los Angeles  California   
3  Standard Class   Consumer  United States  Fort Lauderdale     Florida   
4  Standard Class   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region         Category Sub-Category     Sales  Quantity  \
0        42420  South        Furniture    Bookcases  261.9600         2   
1        42420  South        Furniture       Chairs  731.9400         3   
2        90036   West  Office Supplies       Labels   14.6200         2   
3        33311  South        Furniture       Tables  957.5775         5   
4        33311  South  Office Supplies      Storage   22.3680         2   

   Discount    Profit  
0      0.00   41.9136  
1      0.00  219.5820  
2      0.00    6.871

In [15]:
# Dimensions of the CSV frame as a (rows, columns) tuple.
csv_shape = df_csv.shape
print(csv_shape)

(9994, 13)


In [19]:
# Inferred dtype of every column in the CSV frame.
csv_dtypes = df_csv.dtypes
print(csv_dtypes)

Ship Mode        object
Segment          object
Country          object
City             object
State            object
Postal Code       int64
Region           object
Category         object
Sub-Category     object
Sales           float64
Quantity          int64
Discount        float64
Profit          float64
dtype: object


In [25]:
# Count missing values per column (isna is the same check as isnull).
csv_missing_per_column = df_csv.isna().sum()
print(csv_missing_per_column)

Ship Mode       0
Segment         0
Country         0
City            0
State           0
Postal Code     0
Region          0
Category        0
Sub-Category    0
Sales           0
Quantity        0
Discount        0
Profit          0
dtype: int64


In [27]:
# Preview the first five rows of the Excel data.
# Left as the last expression so Jupyter shows a rendered table rather than
# print()'s line-wrapped plain text.
df_excel.head()

        Ship Mode    Segment        Country             City       State  \
0    Second Class   Consumer  United States        Henderson    Kentucky   
1    Second Class   Consumer  United States        Henderson    Kentucky   
2    Second Class  Corporate  United States      Los Angeles  California   
3  Standard Class   Consumer  United States  Fort Lauderdale     Florida   
4  Standard Class   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region         Category Sub-Category     Sales  Quantity  \
0        42420  South        Furniture    Bookcases  261.9600         2   
1        42420  South        Furniture       Chairs  731.9400         3   
2        90036   West  Office Supplies       Labels   14.6200         2   
3        33311  South        Furniture       Tables  957.5775         5   
4        33311  South  Office Supplies      Storage   22.3680         2   

   Discount    Profit  
0      0.00   41.9136  
1      0.00  219.5820  
2      0.00    6.871

In [29]:
# Dimensions of the Excel frame as a (rows, columns) tuple.
excel_shape = df_excel.shape
print(excel_shape)

(9994, 13)


In [31]:
# Preview the first rows of the JSON data.
# Left as the last expression so Jupyter renders the frame as a table instead of
# the plain text produced by print().
df_json.head()

                                              people
0  {'firstName': 'Joe', 'lastName': 'Jackson', 'g...
1  {'firstName': 'James', 'lastName': 'Smith', 'g...
2  {'firstName': 'Emily', 'lastName': 'Jones', 'g...


In [33]:
# Dimensions of the JSON frame as a (rows, columns) tuple.
json_shape = df_json.shape
print(json_shape)

(3, 1)


In [35]:
# Inferred dtype of every column in the JSON frame.
json_dtypes = df_json.dtypes
print(json_dtypes)

people    object
dtype: object


In [41]:
# Count missing values per column of the JSON frame (isna is the same check as isnull).
json_missing_per_column = df_json.isna().sum()
print(json_missing_per_column)

people    0
dtype: int64


# Summarizing the Data Using Descriptive Statistics

In [44]:
# Summary statistics for the numeric columns of the CSV data.
# Left as the last expression so Jupyter renders the summary as a table rather than
# the plain text produced by print().
# NOTE(review): 'Postal Code' is summarised because it is int64; its mean/std are
# not meaningful if the column is an identifier.
df_csv.describe()

        Postal Code         Sales     Quantity     Discount       Profit
count   9994.000000   9994.000000  9994.000000  9994.000000  9994.000000
mean   55190.379428    229.858001     3.789574     0.156203    28.656896
std    32063.693350    623.245101     2.225110     0.206452   234.260108
min     1040.000000      0.444000     1.000000     0.000000 -6599.978000
25%    23223.000000     17.280000     2.000000     0.000000     1.728750
50%    56430.500000     54.490000     3.000000     0.200000     8.666500
75%    90008.000000    209.940000     5.000000     0.200000    29.364000
max    99301.000000  22638.480000    14.000000     0.800000  8399.976000


In [46]:
# Summary of the JSON frame; its only column is object-typed, so the result is
# count/unique/top/freq rather than numeric statistics.
# Left as the last expression so Jupyter renders it as a table instead of print() text.
df_json.describe()

                                                   people
count                                                   3
unique                                                  3
top     {'firstName': 'Joe', 'lastName': 'Jackson', 'g...
freq                                                    1


In [48]:
# Summary covering every column: include='all' adds count/unique/top/freq for the
# object columns alongside the numeric statistics (non-applicable cells are NaN).
# Left as the last expression so Jupyter renders this wide summary as a table rather
# than the line-wrapped plain text produced by print().
df_csv.describe(include='all')

             Ship Mode   Segment        Country           City       State  \
count             9994      9994           9994           9994        9994   
unique               4         3              1            531          49   
top     Standard Class  Consumer  United States  New York City  California   
freq              5968      5191           9994            915        2001   
mean               NaN       NaN            NaN            NaN         NaN   
std                NaN       NaN            NaN            NaN         NaN   
min                NaN       NaN            NaN            NaN         NaN   
25%                NaN       NaN            NaN            NaN         NaN   
50%                NaN       NaN            NaN            NaN         NaN   
75%                NaN       NaN            NaN            NaN         NaN   
max                NaN       NaN            NaN            NaN         NaN   

         Postal Code Region         Category Sub-Category      

In [50]:
# Summary of the JSON frame with include='all'. The frame's only column is object-typed,
# so the recorded result is identical to the plain describe() call.
# Left as the last expression so Jupyter renders it as a table instead of print() text.
df_json.describe(include='all')

                                                   people
count                                                   3
unique                                                  3
top     {'firstName': 'Joe', 'lastName': 'Jackson', 'g...
freq                                                    1


# Filtering and Selecting Specific Columns or Rows

In [55]:
# Column selection: keep only the Region and Profit columns (all rows).
df_filtered_columns = df_csv[['Region', 'Profit']]
# Last expression, so Jupyter renders the preview as a table instead of print() text.
df_filtered_columns.head()

  Region    Profit
0  South   41.9136
1  South  219.5820
2   West    6.8714
3  South -383.0310
4  South    2.5164


In [59]:
# Row filter: keep the rows whose Profit exceeds 100 (all columns retained).
# Bound to its own name so the Region/Profit column selection held in
# df_filtered_columns is not overwritten by a frame with a different meaning.
df_filtered_rows = df_csv[df_csv['Profit'] > 100]
# Last expression, so Jupyter renders the preview as a table instead of print() text.
df_filtered_rows.head()

         Ship Mode    Segment        Country           City       State  \
1     Second Class   Consumer  United States      Henderson    Kentucky   
13  Standard Class   Consumer  United States        Seattle  Washington   
24  Standard Class   Consumer  United States           Orem        Utah   
35     First Class  Corporate  United States     Richardson       Texas   
54  Standard Class  Corporate  United States  New York City    New York   

    Postal Code   Region         Category Sub-Category     Sales  Quantity  \
1         42420    South        Furniture       Chairs   731.940         3   
13        98103     West  Office Supplies      Binders   407.976         3   
24        84057     West        Furniture       Tables  1044.630         3   
35        75080  Central       Technology       Phones  1097.544         7   
54        10024     East       Technology       Phones  1029.950         5   

    Discount    Profit  
1        0.0  219.5820  
13       0.2  132.5922  
24   

In [61]:
# Positional slice: the first five rows (positions 0-4) of the CSV data.
df_filtered_by_index = df_csv.iloc[:5]
# Last expression, so Jupyter renders the slice as a table instead of print() text.
df_filtered_by_index

        Ship Mode    Segment        Country             City       State  \
0    Second Class   Consumer  United States        Henderson    Kentucky   
1    Second Class   Consumer  United States        Henderson    Kentucky   
2    Second Class  Corporate  United States      Los Angeles  California   
3  Standard Class   Consumer  United States  Fort Lauderdale     Florida   
4  Standard Class   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region         Category Sub-Category     Sales  Quantity  \
0        42420  South        Furniture    Bookcases  261.9600         2   
1        42420  South        Furniture       Chairs  731.9400         3   
2        90036   West  Office Supplies       Labels   14.6200         2   
3        33311  South        Furniture       Tables  957.5775         5   
4        33311  South  Office Supplies      Storage   22.3680         2   

   Discount    Profit  
0      0.00   41.9136  
1      0.00  219.5820  
2      0.00    6.871