In [1]:
import pandas as pd
import re

# Lift pandas' display limits so frames are rendered without row or column truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
# Read the activity log and replace the source column names with short labels
# in one chained expression.
df = pd.read_csv("/data/loss.csv").rename(
    columns={
        "DM_EVENT.date_ops_start": "Date",
        "DM_ACTIVITY.activity_code": "Activity Code",
        "DM_ACTIVITY.activity_memo": "Memo",
    }
)

# Preview the first rows to confirm the new column labels.
df.head()

Unnamed: 0,Date,Activity Code,Memo
0,9/1/2014,C2,Drop top plug. Cmt unit displace with 20 bbls ...
1,9/1/2014,C2,Drop bottom plug. Mix and pump 28 m3 of 1.90 S...
2,9/1/2014,C2,Drop top plug. Cement unit displace cement wit...
3,9/1/2014,C2,Rig pump 1.59 m3 of base oil ahead and 1.78 SG...
4,9/1/2014,F1,"Cont drilling 6-1/8"" hole from 3121 m to 3130 ..."


In [3]:
# Report the (rows, columns) size of the loaded frame.
df.shape

(1499, 3)

In [4]:
# Normalise every memo to lower-case text with surrounding whitespace removed.
# str() runs first, so non-string cells are turned into their string form.
df['Memo'] = df['Memo'].map(lambda memo: str(memo).lower().strip())

In [5]:
# Preview the frame again to check that the Memo text is now lower-cased.
df.head()

Unnamed: 0,Date,Activity Code,Memo
0,9/1/2014,C2,drop top plug. cmt unit displace with 20 bbls ...
1,9/1/2014,C2,drop bottom plug. mix and pump 28 m3 of 1.90 s...
2,9/1/2014,C2,drop top plug. cement unit displace cement wit...
3,9/1/2014,C2,rig pump 1.59 m3 of base oil ahead and 1.78 sg...
4,9/1/2014,F1,"cont drilling 6-1/8"" hole from 3121 m to 3130 ..."


### Convert bbls to m3

1 bbl = 0.1589873 m3

In [6]:
def bblToM3(bbl):
    """Convert a volume expressed in barrels to cubic metres.

    The argument is coerced with ``float()``, so numeric strings are accepted.
    Returns that value multiplied by 0.1589873 (cubic metres per barrel).
    """
    cubic_metres_per_barrel = 0.1589873
    volume_bbl = float(bbl)
    return volume_bbl * cubic_metres_per_barrel

# Extract "loss circulation" value

### First Case

Pattern - Keyword `loss` followed by `Number` followed by `Unit`

In [7]:
# Case 1: the keyword "loss" (optionally followed by literal dots), then a
# number, then a unit, each separated by whitespace.
# The pattern is a raw string so that "\s" and "\." reach the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes.
# Named groups label the output columns directly; only the first match in each
# memo is extracted. "bbl" also matches the leading part of "bbls", so both
# spellings are captured as "bbl".
case1 = df['Memo'].str.extract(
    pat=r"(?P<KeyWords>loss[.]*)\s+(?P<Value>[0-9]+\.*[0-9]*)\s+(?P<Unit>m3|bbl)"
)

In [8]:
# Attach the extracted KeyWords/Value/Unit columns to the source rows by index.
# The extraction was run on df['Memo'], so both frames carry the same index
# and the inner join keeps every row.
case1 = df.join(case1, how="inner")

In [9]:
# Show only the rows where the first pattern captured a value.
case1[ case1["Value"].notnull() ]

Unnamed: 0,Date,Activity Code,Memo,KeyWords,Value,Unit
1,9/1/2014,C2,drop bottom plug. mix and pump 28 m3 of 1.90 s...,loss,6.2,m3
3,9/1/2014,C2,rig pump 1.59 m3 of base oil ahead and 1.78 sg...,loss,5.0,m3
7,9/1/2014,FN1C,sweep hole with lo-vis/hi-dense pill. circulat...,loss,3.6,m3
8,9/1/2014,FN1C,"circulate bottom up (2000 lpm, 2600 psi). tota...",loss,7.6,m3
11,9/1/2014,FN1C,"at 2167 m, observe tight spot. attempt to pass...",loss,10.8,m3
15,9/1/2014,CN2,circulate & kill well with driller first circu...,loss,4.0,m3
16,9/1/2014,CN2,establish circulation prior to cement job - 45...,loss,41.0,m3
17,9/1/2014,CN2,open well. flow check - observe flow continuou...,loss,5.0,m3
18,9/1/2014,C1,"established circulation with 300 lpm, 475 psi ...",loss,3.0,m3
27,9/1/2014,C1A,"hold pjsm. r/u fill up tool, wft 500 tons elev...",loss,3.0,m3


### Second Case

Pattern - `Number` followed by `Unit` followed by Keyword `loss`

In [10]:
# Rows the first pattern did not match are the candidates for the second pattern.
case2 = case1.loc[case1["Value"].isna()]

In [11]:
# Case 2: a number, then a unit ("m3", "bbl" or "bbls"), then the keyword
# "loss", each separated by whitespace.
# Case-insensitivity is requested through flags= rather than an inline "(?i)"
# in the middle of the pattern: Python's re module only accepts global inline
# flags at the very start of an expression and raises an error otherwise from
# Python 3.11 on. The raw string keeps "\s" and "\." intact for the regex engine.
case2 = case2['Memo'].str.extract(
    pat=r"(?P<Value>[0-9]+\.*[0-9]*)\s+(?P<Unit>m3|bbl[s]*)\s+(?P<KeyWords>loss)",
    flags=re.IGNORECASE,
)

# Put the columns in the same order as the first case, then attach them to the
# source rows; the inner join keeps only the rows present in case2.
case2 = case2[['KeyWords', 'Value', 'Unit']]
case2 = df.join(case2, how="inner")

In [12]:
# Show only the rows where the second pattern captured a value.
case2[ case2["Value"].notnull() ]

Unnamed: 0,Date,Activity Code,Memo,KeyWords,Value,Unit
408,3/3/2014,C2,line up to cement unit. pump 3.2 m3 of sw. pre...,loss,4.13,m3
646,20/4/2014,C3,"continue n/d and pick up 13-5/8"" bop c/w riser...",loss,5.6,m3
782,26/4/2014,C2,drop top plug. displace with 0.8 m3 of 1.03 sg...,loss,3.0,bbl
1280,20/6/2014,P4F,hold pjsm. bleed off pressure on diverter over...,loss,1.0,bbl
1447,19/7/2014,F4,"circulate bottom up at 2495 m (400 lpm, 860 ps...",loss,6.0,bbls


In [13]:
# Stack the rows matched by either pattern, then cast the captured Value text to float.
merged = pd.concat(
    [frame[frame["Value"].notna()] for frame in (case1, case2)]
).astype({"Value": float})

In [14]:
# Report how many rows matched either pattern, and the column count.
merged.shape

(224, 6)

In [15]:
# Preview the combined matches.
merged.head()

Unnamed: 0,Date,Activity Code,Memo,KeyWords,Value,Unit
1,9/1/2014,C2,drop bottom plug. mix and pump 28 m3 of 1.90 s...,loss,6.2,m3
3,9/1/2014,C2,rig pump 1.59 m3 of base oil ahead and 1.78 sg...,loss,5.0,m3
7,9/1/2014,FN1C,sweep hole with lo-vis/hi-dense pill. circulat...,loss,3.6,m3
8,9/1/2014,FN1C,"circulate bottom up (2000 lpm, 2600 psi). tota...",loss,7.6,m3
11,9/1/2014,FN1C,"at 2167 m, observe tight spot. attempt to pass...",loss,10.8,m3


In [16]:
# Rows whose unit is exactly "m3" pass through unchanged.
m3 = merged.loc[merged["Unit"].eq("m3")].copy()

# Rows whose unit contains "bbl" (i.e. "bbl" or "bbls") have their Value
# rescaled from barrels to cubic metres.
bbls = merged.loc[merged["Unit"].str.contains("bbl")].copy()
bbls["Value"] = bbls["Value"].map(bblToM3)

In [17]:
# Recombine the cubic-metre values and align them to the source rows by index.
result = df.join(pd.concat([m3, bbls])["Value"])

# Rows with no extracted loss get 0. Only the Value column is filled, so
# missing entries in the other columns are left as they are instead of being
# overwritten with 0.
result["Value"] = result["Value"].fillna(0)

# Restore the source column names and name the new column for export.
result = result.rename(
    columns={
        "Date": "DM_EVENT.date_ops_start",
        "Activity Code": "DM_ACTIVITY.activity_code",
        "Memo": "DM_ACTIVITY.activity_memo",
        "Value": "LOSS_CIRCULATION",
    }
)

In [18]:
# Check the final size: the source columns plus the new LOSS_CIRCULATION column.
result.shape

(1499, 4)

In [19]:
# Preview the first rows only; display.max_rows is set to None in this
# notebook, so a bare `result` would render every row of the frame.
result.head(10)

Unnamed: 0,DM_EVENT.date_ops_start,DM_ACTIVITY.activity_code,DM_ACTIVITY.activity_memo,LOSS_CIRCULATION
0,9/1/2014,C2,drop top plug. cmt unit displace with 20 bbls ...,0.0
1,9/1/2014,C2,drop bottom plug. mix and pump 28 m3 of 1.90 s...,6.2
2,9/1/2014,C2,drop top plug. cement unit displace cement wit...,0.0
3,9/1/2014,C2,rig pump 1.59 m3 of base oil ahead and 1.78 sg...,5.0
4,9/1/2014,F1,"cont drilling 6-1/8"" hole from 3121 m to 3130 ...",0.0
5,9/1/2014,P4F,install circulating head and hose. circulate (...,0.0
6,9/1/2014,P4F,r/u cement bonnet. remove bpv. r/u cement head...,0.0
7,9/1/2014,FN1C,sweep hole with lo-vis/hi-dense pill. circulat...,3.6
8,9/1/2014,FN1C,"circulate bottom up (2000 lpm, 2600 psi). tota...",7.6
9,9/1/2014,G6,remove radioactive source. l/d slimxtreme. max...,0.0


In [20]:
# Write the transformed table to disk as UTF-8 CSV, leaving out the index column.
result.to_csv(
    path_or_buf="/data/loss-circulation-transformed.csv",
    encoding='utf-8',
    index=False,
)