In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from pyspark.sql.functions import when, udf, col, regexp_replace, regexp_extract
from pyspark.sql.types import DoubleType,IntegerType, StringType

In [2]:
# Reuse an already-running context if there is one, so re-executing this cell
# does not fail with "Cannot run multiple SparkContexts at once". On a fresh
# kernel this still starts a context on the 'local' master, as before.
sc = SparkContext.getOrCreate( SparkConf().setMaster( 'local' ) )

# SQL entry point bound to the context above.
# NOTE(review): RDD.toDF (used when building `df`) appears to need a SQL
# context/session to exist first — confirm before removing this line.
sqlCtx = SQLContext( sc )

In [3]:
# Sample (ID, Notes) records; every note names a person after the word "by".
data = [
    ('2345', 'Checked by John'),
    ('2398', 'Verified by Stacy'),
    ('2328', 'Verified by Srinivas than some random text'),
    ('3983', 'Double Checked on 2/23/17 by Marsha'),
]

# Distribute the records, then label the two fields to obtain a DataFrame.
df = (
    sc.parallelize(data)
      .toDF(['ID', 'Notes'])
)
df.show()

+----+--------------------+
|  ID|               Notes|
+----+--------------------+
|2345|     Checked by John|
|2398|   Verified by Stacy|
|2328|Verified by Srini...|
|3983|Double Checked on...|
+----+--------------------+



In [4]:
# Keep only the rows whose Notes text contains a match for the regex 'John'.
df.filter( col('Notes').rlike('John') ).show()

+----+---------------+
|  ID|          Notes|
+----+---------------+
|2345|Checked by John|
+----+---------------+



In [5]:
# Group 0 is the entire match, so the extracted text keeps the leading "by ".
df.select(
    '*',
    F.regexp_extract( F.col('Notes'), 'by [a-zA-Z]+', 0 ).alias('fname'),
).show()

+----+--------------------+-----------+
|  ID|               Notes|      fname|
+----+--------------------+-----------+
|2345|     Checked by John|    by John|
|2398|   Verified by Stacy|   by Stacy|
|2328|Verified by Srini...|by Srinivas|
|3983|Double Checked on...|  by Marsha|
+----+--------------------+-----------+



In [6]:
# With two capture groups in the pattern, index 1 selects the first one,
# which is the literal word "by" — not the name.
df.select(
    '*',
    F.regexp_extract( F.col('Notes'), '(by) ([a-zA-Z]+)', 1 ).alias('fname'),
).show()

+----+--------------------+-----+
|  ID|               Notes|fname|
+----+--------------------+-----+
|2345|     Checked by John|   by|
|2398|   Verified by Stacy|   by|
|2328|Verified by Srini...|   by|
|3983|Double Checked on...|   by|
+----+--------------------+-----+



In [7]:
# Index 2 selects the second capture group: the run of letters after "by ",
# i.e. the person's name on its own.
df.select(
    '*',
    F.regexp_extract( F.col('Notes'), '(by) ([a-zA-Z]+)', 2 ).alias('fname'),
).show()

+----+--------------------+--------+
|  ID|               Notes|   fname|
+----+--------------------+--------+
|2345|     Checked by John|    John|
|2398|   Verified by Stacy|   Stacy|
|2328|Verified by Srini...|Srinivas|
|3983|Double Checked on...|  Marsha|
+----+--------------------+--------+



In [8]:
# Group 1 here is the token immediately before " by ". The character class
# admits letters, digits, "_" and "/", so a date such as 2/23/17 is captured
# whole. The result column is still called 'fname' even though it is not a name.
df.select(
    '*',
    F.regexp_extract( F.col('Notes'), '([a-zA-Z0-9_/]+) by ([a-zA-Z]+)', 1 ).alias('fname'),
).show()

+----+--------------------+--------+
|  ID|               Notes|   fname|
+----+--------------------+--------+
|2345|     Checked by John| Checked|
|2398|   Verified by Stacy|Verified|
|2328|Verified by Srini...|Verified|
|3983|Double Checked on...| 2/23/17|
+----+--------------------+--------+



In [9]:
# Same extraction as the previous cell, with the shorthand \w standing in for
# the explicit [a-zA-Z0-9_] class (the "/" is still listed separately).
# The pattern is written as a raw string: in a plain literal "\w" is an invalid
# escape sequence that Python warns about, while the value handed to Spark is
# unchanged.
df.withColumn( 'fname', regexp_extract( df['Notes'], r'([\w/]+) by ([a-zA-Z]+)',1) ).show()

+----+--------------------+--------+
|  ID|               Notes|   fname|
+----+--------------------+--------+
|2345|     Checked by John| Checked|
|2398|   Verified by Stacy|Verified|
|2328|Verified by Srini...|Verified|
|3983|Double Checked on...| 2/23/17|
+----+--------------------+--------+



In [10]:
# Swap each "by <letters>" occurrence for the word "and"; any text that follows
# the name is left in place. The result column is still called 'fname'.
df.select(
    '*',
    F.regexp_replace( F.col('Notes'), 'by [a-zA-Z]+', 'and' ).alias('fname'),
).show()

+----+--------------------+--------------------+
|  ID|               Notes|               fname|
+----+--------------------+--------------------+
|2345|     Checked by John|         Checked and|
|2398|   Verified by Stacy|        Verified and|
|2328|Verified by Srini...|Verified and than...|
|3983|Double Checked on...|Double Checked on...|
+----+--------------------+--------------------+

