In this File:
- Reading JSON
- Handling Text Data with `str` methods
- Handling Missing Data


#### Reading JSON

In [2]:
import pandas as pd

# Load the JSON file that sits next to the notebook into a DataFrame.
df = pd.read_json('data.json')

# to_string() renders every row and column as text instead of the
# truncated default representation.
table_text = df.to_string()
print(table_text)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

-------------------------------------------------------------------------------------------------------------------------------
#### Working with Text values:
  - Lowercasing and Uppercasing
  - Splitting and Replacing
  - Concatenating
  - Other pandas `.str` accessor methods

In [3]:
# Sample records: each key becomes a column and each list holds that
# column's values, one entry per row.
data = {
    'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
    'Age': [27, 24, 22, 32],
    'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
    'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
}

df = pd.DataFrame(data)

# Lowercase every name through the vectorised .str accessor and store
# the result back into the same column.
df["Name"] = df["Name"].str.lower()

print(df)

     Name  Age    Address Qualification
0     jai   27      Delhi           Msc
1  princi   24     Kanpur            MA
2  gaurav   22  Allahabad           MCA
3    anuj   32    Kannauj           Phd


In [5]:
# Uppercase the names; assign() swaps the existing "Name" column for the
# converted values and leaves the column order unchanged.
df = df.assign(Name=df["Name"].str.upper())
print(df)

     Name  Age    Address Qualification
0     JAI   27      Delhi           Msc
1  PRINCI   24     Kanpur            MA
2  GAURAV   22  Allahabad           MCA
3    ANUJ   32    Kannauj           Phd


In [17]:
# Split
# Sample records: each key becomes a column, each list holds its values.
data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age': [27, 24, 22, 32],
        'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Knnuaj'],
        'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

# dropna() removes every ROW that contains a missing value, so the string
# operation below never sees NaN.
#
# str.split("a") yields a list of pieces per row (e.g. 'Allahabad' ->
# ['All', 'h', 'b', 'd']). Only one value can be stored per row in the
# single "Address" column, so the first piece is selected explicitly with
# .str[0]. Assigning the multi-column result of split(..., expand=True)
# to one column raises "ValueError: Columns must be same length as key"
# on recent pandas versions.
df = (
    pd.DataFrame(data)
    .dropna()
    .assign(Address=lambda frame: frame["Address"].str.split("a").str[0])
)
print(df)

     Name  Age Address Qualification
0     Jai   27       N           Msc
1  Princi   24       K            MA
2  Gaurav   22     All           MCA
3    Anuj   32    Knnu           Phd


In [29]:
# Replace
# Rebuild the frame from the `data` dict, then swap the age 27 for its
# spelled-out form; every other age is left untouched.
df = pd.DataFrame(data)
df["Age"] = df["Age"].replace(to_replace=27, value="Twenty Seven")
print(df)

     Name           Age    Address Qualification
0     Jai  Twenty Seven     Nagpur           Msc
1  Princi            24     Kanpur            MA
2  Gaurav            22  Allahabad           MCA
3    Anuj            32     Knnuaj           Phd


In [30]:
# Concat
# Rebuild the frame from the `data` dict, then join each name with the
# address from the same row, separated by ", ".
df = pd.DataFrame(data)
new = df["Address"].copy()
df = df.assign(Name=df["Name"].str.cat(others=new, sep=", "))
print(df)

                Name  Age    Address Qualification
0        Jai, Nagpur   27     Nagpur           Msc
1     Princi, Kanpur   24     Kanpur            MA
2  Gaurav, Allahabad   22  Allahabad           MCA
3       Anuj, Knnuaj   32     Knnuaj           Phd


In [37]:
# Removing white space
# The relevant accessors are str.strip() (both ends), str.rstrip()
# (trailing only) and str.lstrip() (leading only).

# State labels deliberately padded with leading and/or trailing spaces.
state_scores = {
    'State': [' Arizona AZ ', ' Georgia GG ', ' Newyork NY', 'Indiana IN ', 'Florida FL '],
    'Score': [62, 47, 55, 74, 31],
}

# `columns` fixes the column order explicitly.
df1 = pd.DataFrame(state_scores, columns=['State', 'Score'])
print(df1)

          State  Score
0   Arizona AZ      62
1   Georgia GG      47
2    Newyork NY     55
3   Indiana IN      74
4   Florida FL      31


In [35]:
# strip()
# Print the frame with leading AND trailing whitespace removed from
# 'State', without overwriting df1. Keeping df1's padded values intact
# means later whitespace examples still start from the original data when
# the notebook is run top to bottom.
print(df1.assign(State=df1['State'].str.strip()))

        State  Score
0  Arizona AZ     62
1  Georgia GG     47
2  Newyork NY     55
3  Indiana IN     74
4  Florida FL     31


In [38]:
# Remove trailing whitespace only; any leading spaces are kept.
# (Use .str.lstrip() in place of .str.rstrip() to trim the leading side.)
df1 = df1.assign(State=df1['State'].str.rstrip())

print(df1)

         State  Score
0  Arizona AZ      62
1  Georgia GG      47
2   Newyork NY     55
3  Indiana IN      74
4  Florida FL      31


https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.lstrip.html

-------------------------------------------------------------------------------------------------------------------------------

#### Handling Missing Data

Pandas provides some methods specific to missing data. To select NaN entries you can use pd.isnull() (or its companion pd.notnull())

- Replace NaN with a Scalar Value
- Fill NA Forward and Backward
- Drop Missing Values

In [42]:
import pandas as pd
import numpy as np

# Seeded generator so that re-running the notebook always produces the
# same random values (an unseeded draw changes on every run).
rng = np.random.default_rng(42)

# Five labelled rows of standard-normal values in three columns.
df = pd.DataFrame(
    rng.standard_normal((5, 3)),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)

# Reindexing onto a superset of the labels inserts all-NaN rows for the
# labels that were not in the original index ('b', 'd' and 'g').
df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

print(df)

        one       two     three
a -0.793589 -1.931372  0.787836
b       NaN       NaN       NaN
c  1.773147 -1.536140  1.198220
d       NaN       NaN       NaN
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
g       NaN       NaN       NaN
h  0.648061  0.834372  0.337886


- NaN means Not a Number.
- Check for Missing Values:
    -  isnull() and notnull() functions, which are also methods on Series and DataFrame objects

In [44]:
# Boolean mask over column 'one': True where the value is missing.
is_missing = df['one'].isnull()
print(is_missing)

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [45]:
# Boolean mask over column 'one': True where a value is present.
is_present = df['one'].notnull()
print(is_present)

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [48]:
# 1. Replace NaN with a scalar value using fillna()

# fillna() returns a new frame; df itself is not modified here.
zero_filled = df.fillna(0)

print(df)
print("\n")
print("NaN replaced with '0':")
print(zero_filled)

        one       two     three
a -0.793589 -1.931372  0.787836
b       NaN       NaN       NaN
c  1.773147 -1.536140  1.198220
d       NaN       NaN       NaN
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
g       NaN       NaN       NaN
h  0.648061  0.834372  0.337886


NaN replaced with '0':
        one       two     three
a -0.793589 -1.931372  0.787836
b  0.000000  0.000000  0.000000
c  1.773147 -1.536140  1.198220
d  0.000000  0.000000  0.000000
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
g  0.000000  0.000000  0.000000
h  0.648061  0.834372  0.337886


In [49]:
# 2. Fill NA forward / backward

# ffill (a.k.a. pad)      - carry the last valid value forward into NaN rows
# bfill (a.k.a. backfill) - pull the next valid value backward into NaN rows
#
# The dedicated ffill()/bfill() methods are used instead of
# fillna(method=...): the `method` keyword of fillna is deprecated in
# recent pandas releases, while ffill()/bfill() give the same result.
# Neither call modifies df; each returns a new frame.

print("Fill Forward")
print(df.ffill())
print("\n")
print("Fill Backward")
print(df.bfill())

Fill Forward
        one       two     three
a -0.793589 -1.931372  0.787836
b -0.793589 -1.931372  0.787836
c  1.773147 -1.536140  1.198220
d  1.773147 -1.536140  1.198220
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
g  0.823543 -0.437545 -0.472786
h  0.648061  0.834372  0.337886


Fill Backward
        one       two     three
a -0.793589 -1.931372  0.787836
b  1.773147 -1.536140  1.198220
c  1.773147 -1.536140  1.198220
d  0.194884  1.581445 -0.086927
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
g  0.648061  0.834372  0.337886
h  0.648061  0.834372  0.337886


In [50]:
# 3. Drop missing values

# Spelling out the defaults: work row-wise (axis=0) and drop a row as
# soon as any of its values is missing (how='any'). df is not modified.
complete_rows = df.dropna(axis=0, how='any')
print(complete_rows)

        one       two     three
a -0.793589 -1.931372  0.787836
c  1.773147 -1.536140  1.198220
e  0.194884  1.581445 -0.086927
f  0.823543 -0.437545 -0.472786
h  0.648061  0.834372  0.337886


In [51]:
# 4. Replace generic values: swap placeholder values for specific ones
#    with the "replace" method.

df = pd.DataFrame({
    'one': [10, 20, 30, 40, 50, 2000],
    'two': [1000, 0, 30, 40, 50, 60],
})

# Mapping of old value -> new value, applied across every column.
replacements = {1000: 10, 2000: 60}

print(df.replace(replacements))

   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
