In [2]:
import pandas as pd
import numpy as np
# Build a six-element Series holding 10..15, labelled 'a' through 'f'.
z = np.arange(10, 16)
s = pd.Series(z, index=list('abcdef'))
# Positional access goes through .iloc: a bare integer inside [] on a
# label-indexed Series relies on a deprecated positional fallback.
print(s.iloc[2])
# Label-based access goes through .loc.
print(s.loc['d'])

12
13


In [3]:
# Slicing a Series looks much like slicing a Python list, with one difference:
# a positional slice stops before its end, while a label slice includes it.
print(s.iloc[1:4])
print(s.loc['b':'e'])

b    11
c    12
d    13
dtype: int64
b    11
c    12
d    13
e    14
dtype: int64


In [4]:
# pandas provides the .loc and .iloc indexers for selecting rows.
# Plain square brackets ([]) also work and are mostly used for selecting columns.

# describe() gives a quick statistical summary of a Series:
# count, mean, std, min, the quartiles and max.

noise = np.random.randn(10)
temp = pd.Series(28 + 10 * noise)
temp_summary = temp.describe()
print(temp_summary)

count    10.000000
mean     27.606039
std       7.303815
min      15.603680
25%      24.018772
50%      27.258706
75%      30.083322
max      43.279691
dtype: float64


In [6]:
# info() and describe() are the two main methods for getting an overview of
# the data held in a DataFrame.

df = pd.DataFrame({'temp': pd.Series(28 + 10*np.random.randn(10)),
                   'rain': pd.Series(100 + 50*np.random.randn(10)),
                   'location': list('AAAAABBBBB')})
# info() writes its report straight to stdout and returns None, so it is
# called on its own; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   temp      10 non-null     float64
 1   rain      10 non-null     float64
 2   location  10 non-null     object 
dtypes: float64(2), object(1)
memory usage: 368.0+ bytes
None


In [7]:
# The include argument acts as an allow-list: only columns of the listed
# data types are included in the summary.

object_summary = df.describe(include=['object'])
print(object_summary)

       location
count        10
unique        2
top           A
freq          5


In [9]:
# Reading data from JSON

import json
from io import StringIO

# Sample records; the DOJ field holds dates as 'DD-Mon-YYYY' strings.
EmployeeRecords = [{'EmployeeID':451621, 'EmployeeName':'Preeti Jain', 'DOJ':'30-Aug-2008'},
{'EmployeeID':123621, 'EmployeeName':'Ashok Kumar', 'DOJ':'25-Sep-2016'},
{'EmployeeID':451589, 'EmployeeName':'Johnty Rhodes', 'DOJ':'04-Nov-2016'}]

emp_records_json_str = json.dumps(EmployeeRecords)
# read_json expects a path, URL or file-like object. Passing the raw JSON text
# directly is deprecated, so the string is wrapped in StringIO and handed over
# as a file-like object. convert_dates asks for the DOJ column to be parsed
# into datetimes.
df = pd.read_json(StringIO(emp_records_json_str), orient='records', convert_dates=['DOJ'])
print(df)

   EmployeeID   EmployeeName        DOJ
0      451621    Preeti Jain 2008-08-30
1      123621    Ashok Kumar 2016-09-25
2      451589  Johnty Rhodes 2016-11-04


In [10]:
# A single-level index can be attached to a DataFrame in two ways: assign a
# list of labels to its index attribute, or pass that list as the index
# argument of pd.DataFrame. The index argument is used here.

row_labels = [f'row_{n}' for n in range(1, 6)]
df = pd.DataFrame(np.random.rand(5, 2), index=row_labels)
df

Unnamed: 0,0,1
row_1,0.085019,0.097037
row_2,0.030898,0.759178
row_3,0.853104,0.863325
row_4,0.038182,0.635454
row_5,0.981913,0.728722


In [12]:
# The DataFrame df defined below is used to illustrate data transformation.

df = pd.DataFrame({'temp':pd.Series(28 + 10*np.random.randn(10)),
                   'rain':pd.Series(100 + 50*np.random.randn(10)),
                   'location':list('AAAAABBBBB')
})
print(df.head(2))

# Replace the location codes with city names: 'A' becomes 'Hyderabad' and
# 'B' becomes 'Mumbai'. The nested dict maps column -> {old value: new value}.
# Whole cell values are matched, so regex=True is not used: with it, 'A' and
# 'B' would be treated as patterns and replaced wherever they occur inside a
# longer string.

replacements = {
    'location': {'A': 'Hyderabad', 'B': 'Mumbai'}
}
df = df.replace(replacements)
print(df.head(2))

# Rows can also be selected by matching a string column against a pattern,
# e.g. as a first step before modifying them. Here every row whose location
# contains 'umb' is kept.

mumbai_data = df.loc[df.location.str.contains('umb'), :]
print(mumbai_data.head(2))

        temp        rain location
0  18.489678  120.262364        A
1  33.921486  100.575743        A
        temp        rain   location
0  18.489678  120.262364  Hyderabad
1  33.921486  100.575743  Hyderabad
        temp        rain location
5  18.238409  161.043310   Mumbai
6  17.010997   40.842217   Mumbai


In [13]:
# groupby() splits the rows into groups; functions such as mean() are then
# applied to each group separately.

regions = df.groupby('location')
location_means = regions.mean()
print(location_means)

                temp       rain
location                       
Hyderabad  31.414220  89.604635
Mumbai     25.730698  94.650765
