Some notes I took while working with pandas.

In [1]:
import pandas as pd
import numpy as np

Searching and replacing a pattern in a column in a pandas dataframe

In [2]:
# Two-row sample: an integer id plus a hash-like string that uses '=>'
# between keys and values.
d = dict(
    id=[100, 200],
    details=[
        '{"name" => "abc", "age" => 23}',
        '{"name" => "def", "age" => 24}',
    ],
)
df = pd.DataFrame(data=d)
df

Unnamed: 0,id,details
0,100,"{""name"" => ""abc"", ""age"" => 23}"
1,200,"{""name"" => ""def"", ""age"" => 24}"


Now let's replace each '=>' in the 'details' column with ':' so that the values become valid JSON.

In [3]:
# Literal substring replacement of '=>' with ':' in every 'details' value.
# regex=False is passed explicitly because the default for `regex` differs
# between pandas versions; '=>' is meant as plain text, not a pattern.
df['details'] = df['details'].str.replace('=>', ':', regex=False)
df

Unnamed: 0,id,details
0,100,"{""name"" : ""abc"", ""age"" : 23}"
1,200,"{""name"" : ""def"", ""age"" : 24}"


Let's add another column to the DataFrame.

In [4]:
# One pipe-delimited string of sports per row of the frame.
sports = [
    'Football|Cricket',
    'Football|Hockey|Chess|Badminton',
]
df['sports'] = sports
df

Unnamed: 0,id,details,sports
0,100,"{""name"" : ""abc"", ""age"" : 23}",Football|Cricket
1,200,"{""name"" : ""def"", ""age"" : 24}",Football|Hockey|Chess|Badminton


Now, to convert the 'sports' column into a JSON array split on '|', we have to apply the following list-to-JSON-array conversion to the whole column:

In [5]:
import json

In [6]:
# The digits 0-9 as strings, used as a small list to serialize.
x = list(map(str, range(10)))
x

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [7]:
# Serialize the list of strings `x` into a single JSON array string.
json.dumps(x)

'["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]'

Now let's apply this to the whole column.

In [8]:
# Split each pipe-delimited string into a list, then serialize that list as a
# JSON array string. json.dumps is passed to apply directly; wrapping it in a
# lambda adds nothing.
# NOTE: not idempotent -- the column is overwritten, so re-running this cell
# splits the already-serialized strings and JSON-encodes them a second time.
df['sports'] = df['sports'].str.split('|').apply(json.dumps)
df

Unnamed: 0,id,details,sports
0,100,"{""name"" : ""abc"", ""age"" : 23}","[""Football"", ""Cricket""]"
1,200,"{""name"" : ""def"", ""age"" : 24}","[""Football"", ""Hockey"", ""Chess"", ""Badminton""]"


### Dropping columns in a Dataframe. 

Let's create a fake dataset using faker first

In [9]:
from faker import Faker

In [10]:
# Faker generator instance; its methods (name, ipv4_private, phone_number, ...)
# supply the synthetic values for the dataset built below.
fake = Faker()

In [11]:
# Number of synthetic rows; every column below is generated with this length.
N_ROWS = 1000

# Build all columns in a single constructor call instead of growing an empty
# frame one column at a time. Column order matches the dict insertion order.
# NOTE(review): neither Faker nor NumPy is seeded in the visible cells, so the
# generated values differ on every run.
df = pd.DataFrame({
    'name': [fake.name() for _ in range(N_ROWS)],
    'ip': [fake.ipv4_private(network=False, address_class=None) for _ in range(N_ROWS)],
    # Random 0/1 draws converted to booleans by value (astype) rather than by
    # reinterpreting the underlying bytes (view).
    'isTrue': np.random.randint(0, 2, size=N_ROWS, dtype=np.uint8).astype(bool),
    'phone': [fake.phone_number() for _ in range(N_ROWS)],
    'job': [fake.job() for _ in range(N_ROWS)],
    'location': [fake.locale() for _ in range(N_ROWS)],
    'md5': [fake.md5(raw_output=False) for _ in range(N_ROWS)],
})

In [12]:
# Show only the first ten rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,name,ip,isTrue,phone,job,location,md5
0,Jimmy Mclean,10.70.202.157,False,03149870813,Information officer,el_GR,457ad5e643f3bd5d0ca853d9d16c42c9
1,Kristy Booth PhD,192.168.159.53,True,+46(1)9168137510,Community development worker,nso_ZA,c19203e19196cfe10340cded8c051d5d
2,Joseph Thomas,10.19.48.42,False,(306)348-2680x793,Environmental manager,st_ZA,f422b9599bf833131b4cbad7d966383b
3,Misty Ortega,192.168.139.200,False,1-200-180-6592x809,Insurance risk surveyor,an_ES,0950d6b203d9c53c1d5fa25e341802fd
4,Debra Tanner,10.59.10.37,False,(667)065-7829x175,"Production assistant, television",ku_TR,be785f221a060be0a7eb3b1cb528e9b7
5,Alexander Jordan,10.231.150.211,False,094-025-8959x972,Financial manager,nb_NO,3851f57a91c8984630a9cbce77ffdd16
6,Ann Hudson,172.19.181.214,False,(366)204-8540x54692,Sports development officer,tn_ZA,2cd5d17d35e8e2bc3042b36ae03771d4
7,Jacob Woods,10.39.64.23,True,1-713-211-2676,"Scientist, water quality",uk_UA,fd5d1da1f7af185755e713faa33e142d
8,Julia Nixon,192.168.159.97,True,(239)310-6216x878,"Psychologist, forensic",it_CH,8d8fe60b323a2725ab74f30e65dfc6c5
9,Elizabeth Pierce,10.30.17.55,False,940.181.8203x7708,Facilities manager,bhb_IN,7e77d7e9846627811f1ae6d8fd3b1542


Now suppose we want to remove the columns 'isTrue' and 'md5':

In [13]:
# Names of the columns to remove.
to_drop = ['isTrue', 'md5']
# Rebind `df` to the frame returned by drop() instead of mutating with
# inplace=True; `columns=` names the axis explicitly rather than using axis=1.
df = df.drop(columns=to_drop)
# Show only the first ten rows instead of dumping the whole frame.
df.head(10)

Unnamed: 0,name,ip,phone,job,location
0,Jimmy Mclean,10.70.202.157,03149870813,Information officer,el_GR
1,Kristy Booth PhD,192.168.159.53,+46(1)9168137510,Community development worker,nso_ZA
2,Joseph Thomas,10.19.48.42,(306)348-2680x793,Environmental manager,st_ZA
3,Misty Ortega,192.168.139.200,1-200-180-6592x809,Insurance risk surveyor,an_ES
4,Debra Tanner,10.59.10.37,(667)065-7829x175,"Production assistant, television",ku_TR
5,Alexander Jordan,10.231.150.211,094-025-8959x972,Financial manager,nb_NO
6,Ann Hudson,172.19.181.214,(366)204-8540x54692,Sports development officer,tn_ZA
7,Jacob Woods,10.39.64.23,1-713-211-2676,"Scientist, water quality",uk_UA
8,Julia Nixon,192.168.159.97,(239)310-6216x878,"Psychologist, forensic",it_CH
9,Elizabeth Pierce,10.30.17.55,940.181.8203x7708,Facilities manager,bhb_IN
