# Install Dependencies

In [1]:
!pip install pyspark



# Import NLTK


In [2]:
import string
import nltk
import re
import csv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from itertools import groupby

# Read CSV

In [3]:
# Download the NLTK stop-word corpus and keep the English stop words as a set;
# extractPhrase tests every token for membership in it.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Longest candidate phrase, in words, that extractPhrase keeps.
max_length = 4

# Create (or reuse) the Spark session that loads the CSV inputs.
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("COMP5349 Assignment1") \
    .getOrCreate()

def row_to_tuple_gl (row):
  """Project a governing-law row onto ``(Filename, Governing Law)``.

  Args:
      row: A row object exposing ``asDict()`` (e.g. a ``pyspark.sql.Row``)
          with ``Filename`` and ``Governing Law`` columns.
  Returns:
      A 2-tuple ``(filename, governing_law_text)``.
  """
  # Read both columns from the same dict rather than mixing row/dict access.
  row_dict = row.asDict()
  return (row_dict["Filename"], row_dict["Governing Law"])

def row_to_tuple_ac (row):
  """Project a row onto ``(Filename, Change of Control, Anti-assignment)``.

  Args:
      row: A row object exposing ``asDict()`` (e.g. a ``pyspark.sql.Row``)
          with ``Filename``, ``Change of Control`` and ``Anti-assignment``
          columns.
  Returns:
      A 3-tuple ``(filename, change_of_control_text, anti_assignment_text)``.
  """
  # Read every column from the same dict rather than mixing row/dict access.
  row_dict = row.asDict()
  return (
      row_dict["Filename"],
      row_dict["Change of Control"],
      row_dict["Anti-assignment"],
  )

# Use pySpark SQL to read each CSV file (header=True: the first line holds the
# column names), then take the underlying RDD of rows via `.rdd`.
governing_law_rdd_row = spark.read.csv("Governing_Law.csv",header=True).rdd
Assignment_CIC_rdd_row = spark.read.csv("Anti_assignment_CIC_g3.csv",header=True).rdd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/evanxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
22/04/12 15:53:07 WARN Utils: Your hostname, evanxus-MBP.local resolves to a loopback address: 127.0.0.1; using 172.24.0.4 instead (on interface feth4532)
22/04/12 15:53:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/12 15:53:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# General Utility Functions

In [4]:
def extractPhrase(clause):
    """Split a clause into candidate phrases separated by stop words.

    Punctuation is deleted first, the remainder is tokenized, and every
    maximal run of consecutive non-stop-word tokens becomes one phrase.

    Args:
        clause (str): The clause text to process.
    Returns:
        A list of phrases, each a tuple of words, keeping only the phrases
        that contain at most ``max_length`` words.
    """
    # Punctuation is deleted rather than replaced by a space, so
    # "non-exclusive" turns into "nonexclusive".
    clause = re.sub('[%s]' % re.escape(string.punctuation), '', clause)
    word_list = word_tokenize(clause)
    # groupby yields alternating runs keyed True (non-stop words) and False
    # (stop words); only the True runs are candidate phrases.
    groups = groupby(word_list, lambda x: x not in stop_words)
    phrases = [tuple(run) for is_candidate, run in groups if is_candidate]

    return [phrase for phrase in phrases if len(phrase) <= max_length]

def flattenFirst(phrase_list):
    """Flatten a list of phrases into a single list of words.

    Args:
        phrase_list (list): Phrases, each an iterable of words.
    Returns:
        A list holding every word of every phrase, in the original order.
    """
    return [token for words in phrase_list for token in words]

def getCoocur(phrase_list):
    """Sum, per word, how many other words share a phrase with it.

    Args:
        phrase_list (list): Phrases, each a sequence of words.
    Returns:
        A dict mapping each word to the sum of ``len(phrase) - 1`` over every
        occurrence of that word in ``phrase_list``.
    """
    degrees = {}
    for words in phrase_list:
        neighbours = len(words) - 1
        for token in words:
            degrees[token] = degrees.get(token, 0) + neighbours
    return degrees

def getElectedKeywords(doc, top_n=4):
    """Rank the candidate phrases of one document and return the best ones.

    Each word is scored as ``(deg + freq) / freq`` where ``freq`` counts the
    word's occurrences in ``doc`` and ``deg`` sums ``len(candidate) - 1``
    over those occurrences. A candidate's score is the sum of its words'
    scores, added once per occurrence of the candidate in ``doc``.

    Args:
        doc (list): Candidate phrases of the document, each a hashable
            sequence of words (e.g. a tuple).
        top_n (int): Number of top-scoring candidates to return; defaults
            to 4.
    Returns:
        A list of up to ``top_n`` ``(candidate, score)`` pairs sorted by
        descending score; candidates with equal scores keep the order in
        which they first appear in ``doc``.
    """
    freq = {}
    deg = {}
    for candidate in doc:
        others = len(candidate) - 1
        for word in candidate:
            freq[word] = freq.get(word, 0) + 1
            deg[word] = deg.get(word, 0) + others

    score = {word: (deg[word] + freq[word]) / freq[word] for word in freq}

    score_can = {}
    for candidate in doc:
        # Accumulate on top of any earlier occurrence of the same candidate.
        total = score_can.get(candidate, 0)
        for word in candidate:
            total += score[word]
        score_can[candidate] = total

    ranked = sorted(score_can.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

# Method 1
Treat each clause as a document and each category as a corpus.

## Governing Law

In [5]:
def row_to_tuple_gl_doc (row):
    """Return the cleaned, lower-cased "Governing Law" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with a ``Governing Law`` column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``
        or holds the placeholder ``nan``.
    """
    gl = row.asDict()["Governing Law"]
    # A missing cell (None) would otherwise make re.sub raise TypeError.
    if gl is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    gl = re.sub(r'\(Page.*?\)', '', gl)
    # Collapse runs of spaces into one
    gl = re.sub(' +', ' ', gl)
    gl = gl.lower()
    if gl == 'nan':
        return ""
    return gl


# Clean every clause, split it on ". " and turn each resulting piece into its
# list of candidate phrases (one list per "document"). Cached because both
# the RDF and the EDF lineages below start from this RDD.
gl_candidate_list = governing_law_rdd_row.map(row_to_tuple_gl_doc)\
                                         .flatMap(lambda sentence: sentence.strip().split(". "))\
                                         .map(extractPhrase)\
                                         .cache()

# Calculate RDF list: number of documents in which each candidate appears
# (map(set) de-duplicates the candidates inside one document first)
gl_rdf_list = gl_candidate_list.map(set)\
                               .flatMap(lambda candidates: candidates)\
                               .map(lambda candidate: (candidate, 1))\
                               .reduceByKey(lambda x, y : x + y)\
                               .sortBy(lambda r: r[1], ascending=False)

# Calculate EDF list: number of documents in which each candidate is among
# the top-scoring candidates returned by getElectedKeywords
gl_edf_list = gl_candidate_list.map(getElectedKeywords)\
                              .flatMap(lambda elected: (pair[0] for pair in elected))\
                              .map(lambda candidate: (candidate, 1))\
                              .reduceByKey(lambda x, y : x + y)\
                              .sortBy(lambda r: r[1], ascending=False)

# The join yields (candidate, (rdf, edf)); score each candidate as
# edf * edf / rdf, join its words into one string and rank by that score
gl_ess_list = gl_rdf_list.join(gl_edf_list)\
                        .map(lambda x: (x[0], (x[1][1] * x[1][1]) / x[1][0]))\
                        .map(lambda x: (' '.join(x[0]), x[1]))\
                        .sortBy(lambda r: r[1], ascending=False)

gl_ess_list.take(20)

                                                                                

[('agreement shall', 205.29482071713147),
 ('governed', 47.07520891364903),
 ('agreement', 25.623529411764707),
 ('law principles', 24.324324324324323),
 ('laws', 24.31266149870801),
 ('new york', 20.571428571428573),
 ('new york without regard', 20.0),
 ('laws principles', 16.53125),
 ('construed', 16.318725099601593),
 ('law rules', 13.473684210526315),
 ('law provisions', 13.473684210526315),
 ('new york applicable', 12.0),
 ('delaware without regard', 12.0),
 ('california without regard', 12.0),
 ('laws provisions', 9.941176470588236),
 ('substantive laws', 9.333333333333334),
 ('parties hereto shall', 9.0),
 ('performed entirely within', 9.0),
 ('new york without reference', 9.0),
 ('performed wholly within', 8.0)]

## Anti-assignment

In [9]:
def row_to_tuple_aa_doc (row):
    """Return the cleaned, lower-cased "Anti-assignment" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with an ``Anti-assignment``
            column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``.
    """
    aa = row.asDict()["Anti-assignment"]
    if aa is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    aa = re.sub(r'\(Page.*?\)', '', aa)
    # Collapse runs of spaces into one
    aa = re.sub(' +', ' ', aa)
    return aa.lower()

# Clean every clause, split it on ". " and turn each resulting piece into its
# list of candidate phrases (one list per "document"). Cached because both
# the RDF and the EDF lineages below start from this RDD.
aa_candidate_list = Assignment_CIC_rdd_row.map(row_to_tuple_aa_doc)\
                                         .flatMap(lambda sentence: sentence.strip().split(". "))\
                                         .map(extractPhrase)\
                                         .cache()

# Calculate RDF list: number of documents in which each candidate appears
# (map(set) de-duplicates the candidates inside one document first)
aa_rdf_list = aa_candidate_list.map(set)\
                               .flatMap(lambda candidates: candidates)\
                               .map(lambda candidate: (candidate, 1))\
                               .reduceByKey(lambda x, y : x + y)\
                               .sortBy(lambda r: r[1], ascending=False)

# Calculate EDF list: number of documents in which each candidate is among
# the top-scoring candidates returned by getElectedKeywords
aa_edf_list = aa_candidate_list.map(getElectedKeywords)\
                              .flatMap(lambda elected: (pair[0] for pair in elected))\
                              .map(lambda candidate: (candidate, 1))\
                              .reduceByKey(lambda x, y : x + y)\
                              .sortBy(lambda r: r[1], ascending=False)

# The join yields (candidate, (rdf, edf)); score each candidate as
# edf * edf / rdf, join its words into one string and rank by that score
aa_ess_list = aa_rdf_list.join(aa_edf_list)\
                        .map(lambda x: (x[0], (x[1][1] * x[1][1]) / x[1][0]))\
                        .map(lambda x: (' '.join(x[0]), x[1]))\
                        .sortBy(lambda r: r[1], ascending=False)

aa_ess_list.take(20)

                                                                                

[('prior written consent', 192.60416666666666),
 ('neither party may assign', 43.0),
 ('either party without', 25.6),
 ('attempted assignment', 23.14814814814815),
 ('agreement without', 21.0125),
 ('agreement may', 20.897959183673468),
 ('either party may assign', 19.17391304347826),
 ('unreasonably withheld conditioned', 15.0),
 ('third party without', 13.473684210526315),
 ('neither party shall assign', 12.0),
 ('express written consent', 12.0),
 ('agreement shall', 11.0),
 ('neither party shall', 11.0),
 ('prior written approval', 10.0),
 ('obligations hereunder without', 9.0),
 ('express prior written consent', 9.0),
 ('violation', 9.0),
 ('consent shall', 8.595238095238095),
 ('party may assign', 8.470588235294118),
 ('third party', 8.16326530612245)]

## Change of Control

In [10]:
def row_to_tuple_cc_doc (row):
    """Return the cleaned, lower-cased "Change of Control" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with a ``Change of Control``
            column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``.
    """
    cc = row.asDict()["Change of Control"]
    if cc is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    cc = re.sub(r'\(Page.*?\)', '', cc)
    # Collapse runs of spaces into one
    cc = re.sub(' +', ' ', cc)
    return cc.lower()

# Clean every clause, split it on ". " and turn each resulting piece into its
# list of candidate phrases (one list per "document"). Cached because both
# the RDF and the EDF lineages below start from this RDD.
cc_candidate_list = Assignment_CIC_rdd_row.map(row_to_tuple_cc_doc)\
                                         .flatMap(lambda sentence: sentence.strip().split(". "))\
                                         .map(extractPhrase)\
                                         .cache()

# Calculate RDF list: number of documents in which each candidate appears
# (map(set) de-duplicates the candidates inside one document first)
cc_rdf_list = cc_candidate_list.map(set)\
                               .flatMap(lambda candidates: candidates)\
                               .map(lambda candidate: (candidate, 1))\
                               .reduceByKey(lambda x, y : x + y)\
                               .sortBy(lambda r: r[1], ascending=False)

# Calculate EDF list: number of documents in which each candidate is among
# the top-scoring candidates returned by getElectedKeywords
cc_edf_list = cc_candidate_list.map(getElectedKeywords)\
                              .flatMap(lambda elected: (pair[0] for pair in elected))\
                              .map(lambda candidate: (candidate, 1))\
                              .reduceByKey(lambda x, y : x + y)\
                              .sortBy(lambda r: r[1], ascending=False)

# The join yields (candidate, (rdf, edf)); score each candidate as
# edf * edf / rdf, join its words into one string and rank by that score
cc_ess_list = cc_rdf_list.join(cc_edf_list)\
                        .map(lambda x: (x[0], (x[1][1] * x[1][1]) / x[1][0]))\
                        .map(lambda x: (' '.join(x[0]), x[1]))\
                        .sortBy(lambda r: r[1], ascending=False)

cc_ess_list.take(20)

                                                                                

[('prior written consent', 9.0),
 ('ownership transfer', 4.0),
 ('agreement may', 3.5),
 ('control� shall mean', 3.0),
 ('control shall', 3.0),
 ('agreement immediately upon', 3.0),
 ('excitehome named competitor', 3.0),
 ('agreement upon written notice', 3.0),
 ('definitive agreement', 3.0),
 ('sole discretion terminate', 3.0),
 ('express written consent', 3.0),
 ('terminated upon', 3.0),
 ('licensor may terminate', 3.0),
 ('written notice', 2.7777777777777777),
 ('merger consolidation', 2.25),
 ('agreement shall terminate', 2.25),
 ('agreement shall', 2.25),
 ('party may assign', 2.0),
 ('c shall assign', 2.0),
 ('village media company', 2.0)]

# Method 2
Treat each category as a single document.

## Governing Law

In [107]:
def row_to_tuple_gl_doc (row):
    """Return the cleaned, lower-cased "Governing Law" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with a ``Governing Law`` column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``
        or holds the placeholder ``nan``.
    """
    gl = row.asDict()["Governing Law"]
    # A missing cell (None) would otherwise make re.sub raise TypeError.
    if gl is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    gl = re.sub(r'\(Page.*?\)', '', gl)
    # Collapse runs of spaces into one
    gl = re.sub(' +', ' ', gl)
    gl = gl.lower()
    # Drop the "nan" placeholder so it is not scored as a phrase.
    if gl == 'nan':
        return ""
    return gl


# Clean every clause and split it into pieces on ". "
gl_list = governing_law_rdd_row.map(row_to_tuple_gl_doc)\
                               .flatMap(lambda sentence: sentence.strip().split(". "))

# Candidate phrases per piece. Cached because the word-frequency,
# co-occurrence and phrase-ranking lineages all start from this RDD.
gl_phrase_list = gl_list.map(extractPhrase).cache()

# Get freq(x): occurrences of each word across all candidate phrases
gl_word_freq = gl_phrase_list.map(flattenFirst)\
                             .flatMap(lambda words: words)\
                             .map(lambda word: (word, 1))\
                             .reduceByKey(lambda x, y : x + y)\
                             .sortBy(lambda r: r[1], ascending=False)

# Get co-occurrence of x: per word, the summed getCoocur degree
gl_co_list = gl_phrase_list.map(getCoocur)\
                            .flatMap(lambda degrees: degrees.items())\
                            .reduceByKey(lambda x, y : x + y)\
                            .sortBy(lambda r: r[1], ascending=False)

# Get scores: the join yields (word, (freq, co-occurrence)) and each word is
# scored as (freq + co-occurrence) / freq
gl_scores = gl_word_freq.join(gl_co_list)\
                        .map(lambda x: (x[0], (x[1][0] + x[1][1])/x[1][0]))

# Collect to the driver as a dict so getGLPhraseScore can look words up
gl_scores_val = gl_scores.collectAsMap()

def getGLPhraseScore(phrase_list):
    """Score every phrase of a list with the governing-law word scores.

    Args:
        phrase_list (list): Phrases, each an iterable of words that are keys
            of ``gl_scores_val``.
    Returns:
        A dict mapping ``tuple(phrase)`` to the sum of its words' scores; a
        phrase that occurs several times accumulates its score each time.
    """
    totals = {}
    for words in phrase_list:
        key = tuple(words)
        running = totals.get(key, 0)
        for token in words:
            running += gl_scores_val[token]
        totals[key] = running
    return totals

# Get ranked list: total each phrase's score over all phrase lists, join the
# phrase's words with spaces and order by descending total.
gl_ranked_list = (
    gl_phrase_list
    .map(getGLPhraseScore)
    .flatMap(lambda scores: scores.items())
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (' '.join(pair[0]), pair[1]))
    .sortBy(lambda pair: pair[1], ascending=False)
)

gl_ranked_list.take(20)


                                                                                

[('agreement shall', 962.9690721649507),
 ('laws', 605.8899676375408),
 ('state', 440.44285714285553),
 ('governed', 392.4730077120843),
 ('agreement', 308.3333333333333),
 ('accordance', 294.99661016949),
 ('construed', 281.1111111111112),
 ('new york without regard', 255.36229762389556),
 ('new york', 174.00158604282325),
 ('law principles', 139.19047619047612),
 ('conflict', 128.80000000000027),
 ('conflicts', 119.3684210526318),
 ('new york without reference', 115.39952041723953),
 ('california without regard', 107.16700940070508),
 ('delaware without regard', 106.56363761720901),
 ('laws principles', 103.96601941747576),
 ('new york applicable', 103.64903138099011),
 ('law', 96.42857142857147),
 ('principles', 86.16666666666664),
 ('internal laws', 79.7019057892844)]

## Anti-assignment

In [7]:
def row_to_tuple_aa_doc (row):
    """Return the cleaned, lower-cased "Anti-assignment" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with an ``Anti-assignment``
            column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``.
    """
    aa = row.asDict()["Anti-assignment"]
    if aa is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    aa = re.sub(r'\(Page.*?\)', '', aa)
    # Collapse runs of spaces into one
    aa = re.sub(' +', ' ', aa)
    return aa.lower()

# Clean every clause, split it on ". " and extract the candidate phrases of
# each piece. Cached because the word-frequency, co-occurrence and
# phrase-ranking lineages all start from this RDD.
aa_phrase_list = Assignment_CIC_rdd_row.map(row_to_tuple_aa_doc)\
                                       .flatMap(lambda sentence: sentence.strip().split(". "))\
                                       .map(extractPhrase)\
                                       .cache()

# Get freq(x): occurrences of each word across all candidate phrases
aa_word_freq = aa_phrase_list.map(flattenFirst)\
                             .flatMap(lambda words: words)\
                             .map(lambda word: (word, 1))\
                             .reduceByKey(lambda x, y : x + y)\
                             .sortBy(lambda r: r[1], ascending=False)

# Get co-occurrence of x: per word, the summed getCoocur degree
aa_co_list = aa_phrase_list.map(getCoocur)\
                            .flatMap(lambda degrees: degrees.items())\
                            .reduceByKey(lambda x, y : x + y)\
                            .sortBy(lambda r: r[1], ascending=False)

# Get scores: the join yields (word, (freq, co-occurrence)); each word is
# scored as (freq + co-occurrence) / freq and the result is collected to the
# driver as a dict so getAAPhraseScore can look words up
aa_scores_val = aa_word_freq.join(aa_co_list)\
                        .map(lambda x: (x[0], (x[1][0] + x[1][1])/x[1][0]))\
                        .collectAsMap()


def getAAPhraseScore(phrase_list):
    """Score every phrase of a list with the anti-assignment word scores.

    Args:
        phrase_list (list): Phrases, each an iterable of words that are keys
            of ``aa_scores_val``.
    Returns:
        A dict mapping ``tuple(phrase)`` to the sum of its words' scores; a
        phrase that occurs several times accumulates its score each time.
    """
    totals = {}
    for words in phrase_list:
        key = tuple(words)
        running = totals.get(key, 0)
        for token in words:
            running += aa_scores_val[token]
        totals[key] = running
    return totals

# Get ranked list: total each phrase's score over all phrase lists, join the
# phrase's words with spaces and order by descending total.
aa_ranked_list = (
    aa_phrase_list
    .map(getAAPhraseScore)
    .flatMap(lambda scores: scores.items())
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (' '.join(pair[0]), pair[1]))
    .sortBy(lambda pair: pair[1], ascending=False)
)

aa_ranked_list.take(20)

                                                                                

[('prior written consent', 2162.390197970489),
 ('agreement', 682.0576923076922),
 ('party', 492.6967213114752),
 ('neither party may assign', 451.88285445870747),
 ('rights', 364.777472527472),
 ('assign', 325.0687679083097),
 ('agreement without', 324.41688963210714),
 ('either party without', 313.97688617504406),
 ('obligations', 287.2236842105267),
 ('third party', 272.6976989020905),
 ('unreasonably withheld', 259.32690507395637),
 ('either party may assign', 257.5352610126608),
 ('agreement may', 222.11172161172175),
 ('consent shall', 219.0690234667638),
 ('transfer', 209.88744588744558),
 ('consent', 193.8118811881189),
 ('obligations hereunder', 190.81015556847908),
 ('agreement shall', 190.20632373761194),
 ('assignment', 175.5),
 ('obligations hereunder without', 171.8660886492394)]

## Change of Control

In [110]:
def row_to_tuple_cc_doc (row):
    """Return the cleaned, lower-cased "Change of Control" text of a row.

    Args:
        row: Row object exposing ``asDict()`` with a ``Change of Control``
            column.
    Returns:
        The clause text with ``(Page ...)`` markers removed, runs of spaces
        collapsed and letters lower-cased; ``""`` when the cell is ``None``.
    """
    cc = row.asDict()["Change of Control"]
    if cc is None:
        return ""
    # Remove page info (raw string: "\(" is not a valid str escape)
    cc = re.sub(r'\(Page.*?\)', '', cc)
    # Collapse runs of spaces into one
    cc = re.sub(' +', ' ', cc)
    return cc.lower()

# Clean every clause, split it on ". " and extract the candidate phrases of
# each piece. Cached because the word-frequency, co-occurrence and
# phrase-ranking lineages all start from this RDD.
cc_phrase_list = Assignment_CIC_rdd_row.map(row_to_tuple_cc_doc)\
                                       .flatMap(lambda sentence: sentence.strip().split(". "))\
                                       .map(extractPhrase)\
                                       .cache()

# Get freq(x): occurrences of each word across all candidate phrases
cc_word_freq = cc_phrase_list.map(flattenFirst)\
                             .flatMap(lambda words: words)\
                             .map(lambda word: (word, 1))\
                             .reduceByKey(lambda x, y : x + y)\
                             .sortBy(lambda r: r[1], ascending=False)

# Get co-occurrence of x: per word, the summed getCoocur degree
cc_co_list = cc_phrase_list.map(getCoocur)\
                            .flatMap(lambda degrees: degrees.items())\
                            .reduceByKey(lambda x, y : x + y)\
                            .sortBy(lambda r: r[1], ascending=False)

# Get scores: the join yields (word, (freq, co-occurrence)); each word is
# scored as (freq + co-occurrence) / freq and the result is collected to the
# driver as a dict so getCCPhraseScore can look words up
cc_scores_val = cc_word_freq.join(cc_co_list)\
                        .map(lambda x: (x[0], (x[1][0] + x[1][1])/x[1][0]))\
                        .collectAsMap()


def getCCPhraseScore(phrase_list):
    """Score every phrase of a list with the change-of-control word scores.

    Args:
        phrase_list (list): Phrases, each an iterable of words that are keys
            of ``cc_scores_val``.
    Returns:
        A dict mapping ``tuple(phrase)`` to the sum of its words' scores; a
        phrase that occurs several times accumulates its score each time.
    """
    totals = {}
    for words in phrase_list:
        key = tuple(words)
        running = totals.get(key, 0)
        for token in words:
            running += cc_scores_val[token]
        totals[key] = running
    return totals

# Get ranked list: total each phrase's score over all phrase lists, join the
# phrase's words with spaces and order by descending total.
cc_ranked_list = (
    cc_phrase_list
    .map(getCCPhraseScore)
    .flatMap(lambda scores: scores.items())
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (' '.join(pair[0]), pair[1]))
    .sortBy(lambda pair: pair[1], ascending=False)
)

cc_ranked_list.take(20)

[('agreement', 216.99555555555557),
 ('change', 191.0526315789476),
 ('control', 164.3999999999997),
 ('prior written consent', 150.3283793938382),
 ('party', 128.02162162162156),
 ('third party', 69.76607642124883),
 ('event', 66.90410958904108),
 ('transfer', 63.25714285714287),
 ('terminate', 62.6470588235294),
 ('agreement may', 60.004444444444445),
 ('written notice', 55.20229633679607),
 ('rights', 45.56862745098038),
 ('sale', 43.571428571428555),
 ('merger', 42.0943396226415),
 ('substantially', 40.526315789473685),
 ('effective date', 39.325),
 ('notice', 38.96774193548387),
 ('assignment', 38.07692307692306),
 ('merger consolidation succession', 36.920526014865644),
 ('assets', 35.57894736842106)]