## Regular Expressions

Defining REs in Python is straightforward:

In [None]:
import re # regular expression

# Two patterns: the bare word shape, and the same shape wrapped in three
# capture groups (text before, the word itself, text after)
pattern = re.compile(r"[bcrh]at")
pattern2 = re.compile(r"(.*)([bcrh]at)(.*)")

We can then apply the pattern to a string with `search()` or `match()`.

`search()` will return a result if the pattern occurs **anywhere** in the input string.

`match()` will only return a result if the pattern matches at the **beginning** of the input string; use `fullmatch()` to require that the **entire** string matches.

In [None]:
word = 'the batter won the game'
# word = 'hi how are you'  # alternative input that contains no match
# match() anchors at the start of the string; pattern2 begins with (.*), so it
# still matches here and captures the text before, at, and after the word
matches = pattern2.match(word)
# search() scans the string and returns the first occurrence it finds
searches = pattern.search(word)

In [None]:
print(matches.groups()) # groups are identified by parenthesis
# we can split in groups
print(searches)


('the ', 'bat', 'ter won the game')
<re.Match object; span=(4, 7), match='bat'>


Both have a number of attributes to access the results. 
- `span()` gives us a tuple with the start and end positions of the matching substring
- `group()` returns the matched substring

In [None]:
span = searches.span()
word[span[0]:span[1]], span

('bat', (4, 7))

In [None]:
searches.group()

'bat'

If we have used several RE groups (in parentheses `()`), we can get all of them at once as a tuple via `groups()`, or a single one via `group(n)`

In [None]:
word = 'preconstitutionalism'
# Capture the first three and the last three characters; the greedy `.+`
# in between must consume at least one character
affixes = re.compile(r'(...).+(...)')
affixes.search(word).groups()  # -> the two captured groups: first (...) and last (...)

('pre', 'ism')

For the email address finder, we can use a more advanced pattern and test it:

In [None]:
# A simple e-mail shape: an alphanumeric first character, an optional run of
# alphanumerics/dots/hyphens, an '@', a domain that starts with an
# alphanumeric, and finally a dot followed by two or three more characters.
# Written as a raw string: in a plain literal `\.` is an invalid escape
# sequence that newer Python versions warn about.
email = re.compile(r'^[A-Za-z0-9][A-Za-z0-9\.-]*@[A-Za-z0-9][A-Za-z0-9\.-]+\.[A-Za-z0-9\.-][A-Za-z0-9\.-][A-Za-z0-9\.-]?$')
# alternative test set:
# ['me.@unibocconi.it', '@web.de', '.@gmx.com', 'not working@aol.com']

addresses = [
    'notMyFault@webmail.com',
    'smithie123@gmx',
    'Free stuff@unibocconi.it',
    'mark_my_words@hotmail;com',
    'truthOrDare@webmail.in',
    'look@me@twitter.com',
    'how2GetAnts@aol.dfdsfgfdsgfd',
]
for address in addresses:
    # prints the Match object for accepted addresses and None for rejected ones
    print(address, email.match(address))

notMyFault@webmail.com <re.Match object; span=(0, 22), match='notMyFault@webmail.com'>
smithie123@gmx None
Free stuff@unibocconi.it None
mark_my_words@hotmail;com None
truthOrDare@webmail.in <re.Match object; span=(0, 22), match='truthOrDare@webmail.in'>
look@me@twitter.com None
how2GetAnts@aol.dfdsfgfdsgfd None


We can also use the pattern to replace elements of a string that match with `sub()`

In [None]:
# str.replace() only swaps one fixed, literal substring
print('Are you all awake?'.replace('?', '!'))

# a regular expression generalizes this: the pattern below matches any
# single digit, so every digit in the text gets replaced
numbers = re.compile(r'[0-9]') # single digit
numbers.sub('0', 'Back in the 90s, when I was a 12-year-old, a CD cost just 15,99EUR!')

Are you all awake!


'Back in the 00s, when I was a 00-year-old, a CD cost just 00,00EUR!'

## Exercise

Write a RegEx to remove all user names from the tweets and replace them with the token "@USER"

In [None]:
! pip install wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=b1989b41eace70e0d4c3550bd2a68544506b6e390c0960994d137643afbd21db
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/tweets_en.txt'
wget.download(url, 'tweets_en.txt')
# Read one tweet per line; the `with` block closes the file handle afterwards
with open('tweets_en.txt', encoding='utf8') as tweets_file:
  tweets = [line.strip() for line in tweets_file]

In [None]:
tweets[:10]

['@cosmetic_candy I think a lot of people just enjoy being a pain in the ass on there',
 'Best get ready sunbed and dinner with nana today :)',
 '@hardlyin70 thats awesome!',
 'Loving this weather',
 '“@danny_boy_37: Just seen an absolute idiot in shorts! Be serious!” Desperado gentleman',
 '@SamanthaOrmerod trying to resist a hardcore rave haha! Resisting towns a doddle! Posh dance floor should wear them in quite easy xx',
 '59 days until @Beyonce!!! Wooo @jfracassini #cannotwait',
 'That was the dumbest tweet i ever seen',
 'Oh what to do on this fine sunny day?',
 '@Brooke_C_X hows the fish ? Hope they r ok. Xx']

In [None]:
# your code here
user_pattern = re.compile('@[A-Za-z0\.-9_]+')
print(tweets[0])
print(re.match(user_pattern, tweets[0]))
re.sub(user_pattern, "@USER", tweets[0])

for tweet in tweets:
  print(re.sub(user_pattern, "@USER", tweet)) # sub is not inplace

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
@USER me too in #nyc gutted to be flying home to UK very early tomorrow and missing your gig. I'm sure it will be amazing as ever.
Wits the script for today?
@USER I even get with all the wall running stuff you need walls but destructibility in a shooter changes tactics and keeps it interest
@USER lost it out in the grass thought it was gone hah
I'm at Stratton House Hotel Biggleswade (Biggleswade) http://t.co/pYRW7ITvOI
FINALLY got John greens "fault in our stars" and "looking for Alaska"
I feel like a bear with a sore head. Had real fun at GLADD drinks and met some lovely people - will definitely be going again :)
@USER @USER I'm laughing at how much he's tweeting today 😂
@USER good! And ow I will be don't worry ehe x
@USER me too ahhh what day shall we go out ;) xxx
Dad won't stop talking about gears, go away not in the mood #hanging
@USER I know babygurl, we can bond even more :):):):):):)
@USER what sorta time you thi

Now, write a RegEx to extract all user names from the tweets


In [None]:
# your code here
list_mentions = []
for tweet in tweets:
  if re.search(user_pattern, tweet) is not None:
    list_mentions.append(re.search(user_pattern, tweet).group())

list_mentions

['@cosmetic_candy',
 '@hardlyin70',
 '@danny_boy_37',
 '@SamanthaOrmerod',
 '@Beyonce',
 '@Brooke_C_X',
 '@Jbowe_',
 '@louise_munchi',
 '@guy_clifton',
 '@StephanieLee__',
 '@peterbaird5',
 '@GreigSweeney',
 '@LukeyR9',
 '@michalgarnett',
 '@hannahkerr3',
 '@ScarletSophie',
 '@SW16Massage',
 '@JBieberBubba',
 '@Bulliopr',
 '@correr46',
 '@nataliemunro19',
 '@elliemilne',
 '@EllysCutmore',
 '@yvonne_88',
 '@charleybarley20',
 '@FreakinMerv',
 '@LewisKing17',
 '@Stemo666',
 '@JoMcGrann',
 '@RyanLumbEvans',
 '@justinbieber',
 '@_connorwynn',
 '@_rkells',
 '@Schofe',
 '@beccyhawk',
 '@RoyalNobRecords',
 '@_baldyy',
 '@ellieburns',
 '@CharlieSloth',
 '@SarahFrenchyy',
 '@katie_seymour',
 '@jamieburt_93',
 '@JohnButt7',
 '@becki_lowe',
 '@Babyyy_Cakees',
 '@jademchalexx',
 '@JONBOOGIEE',
 '@Nekhanimal',
 '@indiachesterman',
 '@vicky_outen',
 '@CarDealerEd',
 '@BenTravesy',
 '@SarahhLangleyy',
 '@CharlieBrady96',
 '@RyanRobinson02',
 '@naomi_mariex',
 '@MatchroomBoxing',
 '@jordanbowers_',
 '

## Exercise

Write a RegEx to search for all hashtags containing the word `good` in them.

In [None]:
# your code here
pattern = '[\W]*(goo+d)(?i)[\W]*'
ash_pattern = re.compile(pattern)

for tweet in tweets:
  if re.findall(ash_pattern, tweet):
    print(tweet, re.findall(ash_pattern, tweet))

  This is separate from the ipykernel package so we can avoid doing imports until


@SW16Massage sounds good, I want some of that too... ['good']
@EllysCutmore Happy Birthday Ellys have a gooden 👌 ['good']
@RyanLumbEvans I will follow is a good un ['good']
@BenTravesy good night mate? ['good']
@MatchroomBoxing @brian_lion_rose @skysportsboxing good test for him that is! @mattmacklin walked through alcine in his last fight! ['good']
My grandads been talking to me about bread for a good 20 minutes now and he didn't even notice me fall asleep for 5 minutes #saveme ['good']
@tomwookieford have you come across Gta Spano yet,,,any special editions or anything on their stand, PS new ssangyong Rodius looks good ['good']
@HelloNikto lol no I'm a good boy:)! Think I'm just getting a fine:) ['good']
“@PiscesAreUs: We might as well start with the good news: #Pisces are great in the sack.” HHHAaahahaHAHAA😶😶😶😶😶😶😶 ['good']
Having lunch with my lovely friend, champneys booked for May. It's a summers day and life is good .. ['good']
@paz_leigh i always made mine have an affair with a 

In [None]:
# For every tweet with at least one hit, show all hits and then the text of
# the first one
for tweet in tweets:
  first_hit = ash_pattern.search(tweet)
  if first_hit is None:
    continue
  print(ash_pattern.findall(tweet))
  print(first_hit.group())

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
['good']
 #good
['good']
 good 😐😋😊
['good']
 good
['Good']
Good 
['good']
 good 
['good']
 good 
['Good']
Good 
['good']
 good 
['good']
 good 
['Good']
 Good 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['Good']
. Good 
['Good']
. Good 
['good']
 good 
['Good']
? Good
['good']
 good' #
['good']
 good 
['good']
, good 
['good']
 good 
['good']
 good 
['Good']
Good 
['good']
 good 
['good']
 good! @
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['Good']
Good 
['good']
 good, 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good. #
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good. 
['Good']
Good
['good']
 good 
['good']
 good 
['good']
 good 
['good']
 good,
['good']
, good 
['good']
 good 
['good']
 good 
['Good']
Good 
['Good']
. Good 
['good']
 good #
['good']
 good 
['good']
 good 
['Good']
! Good

## TF-IDF

Let's extract the most important words from Moby Dick

In [None]:
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/moby_dick.txt'
wget.download(url, 'moby_dick.txt')

'moby_dick.txt'

In [None]:
import pandas as pd
# Each line of the file is treated as one document; the `with` block closes
# the file handle once the lines have been read
with open('moby_dick.txt', encoding='utf8') as moby_file:
  documents = [line.strip() for line in moby_file]
print(documents[1])

Call me Ishmael .


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF with the built-in English stop-word list; min_df/max_df
# are given as floats, i.e. as document-frequency proportions
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word', min_df=0.001, max_df=0.75, stop_words='english',
)

# fit the vocabulary and build the document-term matrix in one step
X = tfidf_vectorizer.fit_transform(documents)

In [None]:
X.shape

(9768, 1850)

Now, let's get the same information as raw counts:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts per document, with word-level analysis, English stop words
# removed, and float min_df/max_df document-frequency cut-offs
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=0.001,
    max_df=0.75,
    stop_words='english',
)

X2 = vectorizer.fit_transform(documents)

In [None]:
X.shape, X2.shape # same features (number of columns are same)

((9768, 1850), (9768, 1850))

In [None]:
len(X.sum(axis=0).A1)

1850

In [None]:
len(X2.sum(axis=0).A1)

1850

In [None]:
# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()

# One row per vocabulary term: total raw count, IDF weight, and the TF-IDF
# weight summed over all documents
df = pd.DataFrame(data={'word': feature_names,
                        'tf': X2.sum(axis=0).A1,
                        'idf': tfidf_vectorizer.idf_,
                        'tfidf': X.sum(axis=0).A1
                       })

In [None]:
df = df.sort_values(['tfidf', 'tf', 'idf'], ascending=False)
df

Unnamed: 0,word,tf,idf,tfidf
1782,whale,1150,3.262357,227.810433
1838,ye,467,4.257380,156.303115
231,chapter,171,5.039475,148.326709
972,man,525,3.982412,134.457265
922,like,639,3.808543,134.103472
...,...,...,...,...
1423,shortly,10,7.789074,2.994518
1735,valiant,10,7.789074,2.938086
554,fleet,11,7.702063,2.917283
1602,surprise,10,7.789074,2.871766


In [None]:
df = df.sort_values(['tf', 'idf'], ascending=False)
df

Unnamed: 0,word,tf,idf,tfidf
1782,whale,1150,3.262357,227.810433
922,like,639,3.808543,134.103472
972,man,525,3.982412,134.457265
21,ahab,511,4.019453,131.659834
1414,ship,509,4.006953,111.576500
...,...,...,...,...
1192,pitched,10,7.789074,3.011663
1423,shortly,10,7.789074,2.994518
1735,valiant,10,7.789074,2.938086
1602,surprise,10,7.789074,2.871766


## Exercise
Extract **only** the bigrams (no unigrams) from Moby Dick and find the top 10 in terms of TF-IDF.

In [None]:
# your code
# ngram_range=(2, 2) sets both the minimum and the maximum n-gram length to 2,
# so only bigrams are extracted
tfidf_vectorizer = TfidfVectorizer(analyzer='word',
                                   min_df=0.001,
                                   max_df=0.75,
                                   stop_words='english',
                                   ngram_range=(2, 2)
                                   )

X3 = tfidf_vectorizer.fit_transform(documents)

vectorizer = CountVectorizer(analyzer='word', min_df=0.001, max_df=0.75, stop_words='english', ngram_range=(2, 2))
X4 = vectorizer.fit_transform(documents)

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides
if hasattr(vectorizer, 'get_feature_names_out'):
  bigram_names = vectorizer.get_feature_names_out()
else:
  bigram_names = vectorizer.get_feature_names()

# One row per bigram: total raw count, IDF weight, and summed TF-IDF weight
df2 = pd.DataFrame(data={'word': bigram_names,
                        'tf': X4.sum(axis=0).A1,
                        'idf': tfidf_vectorizer.idf_,
                        'tfidf': X3.sum(axis=0).A1
                       })

# top 10 bigrams by summed TF-IDF
df2 = df2.sort_values(['tfidf'], ascending=False)
df2[:10]

Unnamed: 0,word,tf,idf,tfidf
56,sperm whale,176,5.051171,143.508986
73,white whale,106,5.581799,89.735125
43,old man,81,5.78025,73.239287
37,moby dick,83,5.817522,68.828613
8,captain ahab,64,6.043835,53.891712
48,right whale,55,6.161618,46.324282
35,mast head,47,6.315768,41.35129
36,mast heads,36,6.576051,32.451617
12,cried ahab,33,6.660609,31.30819
71,whale ship,33,6.660609,29.261082


## PMI
Extracting PMI from text is relatively straightforward, and `nltk` offers some functions to do so flexibly.

In [None]:
import nltk
# NOTE(review): 'all' downloads every NLTK data package; the cells visible
# here only read the English stop-word list, so nltk.download('stopwords')
# would presumably be enough — confirm nothing else relies on the full set
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [None]:
# Build the stop-word set here as well, so this cell also runs when the cell
# further down that defines `stopwords_` has not been executed yet
from nltk.corpus import stopwords
stopwords_ = set(stopwords.words('english'))

# Preview: the tokens of the first ten lines that are not in the stop-word set
[word for document in documents[:10] for word in document.split() if word not in stopwords_]

In [None]:
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords
from collections import Counter

stopwords_ = set(stopwords.words('english'))

# Lower-case the text BEFORE the stop-word test: testing the original token
# lets capitalised stop words such as "The" slip through the filter and end
# up in the list as "the"
words = [token
         for document in documents
         for token in document.lower().split()
         if len(token) > 2 and token not in stopwords_]

# Count adjacent word pairs and score them
finder = BigramCollocationFinder.from_words(words)
bgm = BigramAssocMeasures()
score = bgm.mi_like # mutual information like measure
# key: the two words joined by '_'; value: the bigram's score
collocations = {'_'.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}
collocations

{'moby_dick': 83.0,
 'sperm_whale': 20.002847184002935,
 'mrs_hussey': 10.5625,
 'mast_heads': 4.391152941176471,
 'sag_harbor': 4.0,
 'vinegar_cruet': 4.0,
 'try_works': 3.7944046844502277,
 'dough_boy': 3.7067873303167422,
 'white_whale': 3.698807453416149,
 'caw_caw': 3.4722222222222223,
 'samuel_enderby': 3.4285714285714284,
 'cape_horn': 3.4133333333333336,
 'new_bedford': 3.3402061855670104,
 'quarter_deck': 3.2339339991315676,
 'deacon_deuteronomy': 3.2,
 'father_mapple': 3.0,
 'gamy_jesty': 3.0,
 'hoky_poky': 3.0,
 'jesty_joky': 3.0,
 'joky_hoky': 3.0,
 'sporty_gamy': 3.0,
 'sulk_pout': 3.0,
 'twos_threes': 3.0,
 'mast_head': 2.464640949554896,
 '000_lbs': 2.45,
 'chief_mate': 2.4075114075114077,
 'old_man': 2.269660474055093,
 'straits_sunda': 2.25,
 'crow_nest': 2.227272727272727,
 'crested_comb': 2.0,
 'daboll_arithmetic': 2.0,
 'distension_contraction': 2.0,
 'gemini_twins': 2.0,
 'helter_skelter': 2.0,
 'hogs_bristles': 2.0,
 'kith_kin': 2.0,
 'lirra_skirra': 2.0,
 'pell_m

In [None]:
finder.score_ngrams(score) # same information as before, better using a dict

[(('moby', 'dick'), 83.0),
 (('sperm', 'whale'), 20.002847184002935),
 (('mrs', 'hussey'), 10.5625),
 (('mast', 'heads'), 4.391152941176471),
 (('sag', 'harbor'), 4.0),
 (('vinegar', 'cruet'), 4.0),
 (('try', 'works'), 3.7944046844502277),
 (('dough', 'boy'), 3.7067873303167422),
 (('white', 'whale'), 3.698807453416149),
 (('caw', 'caw'), 3.4722222222222223),
 (('samuel', 'enderby'), 3.4285714285714284),
 (('cape', 'horn'), 3.4133333333333336),
 (('new', 'bedford'), 3.3402061855670104),
 (('quarter', 'deck'), 3.2339339991315676),
 (('deacon', 'deuteronomy'), 3.2),
 (('father', 'mapple'), 3.0),
 (('gamy', 'jesty'), 3.0),
 (('hoky', 'poky'), 3.0),
 (('jesty', 'joky'), 3.0),
 (('joky', 'hoky'), 3.0),
 (('sporty', 'gamy'), 3.0),
 (('sulk', 'pout'), 3.0),
 (('twos', 'threes'), 3.0),
 (('mast', 'head'), 2.464640949554896),
 (('000', 'lbs'), 2.45),
 (('chief', 'mate'), 2.4075114075114077),
 (('old', 'man'), 2.269660474055093),
 (('straits', 'sunda'), 2.25),
 (('crow', 'nest'), 2.2272727272727

In [None]:
Counter(collocations).most_common(20)

[('moby_dick', 83.0),
 ('sperm_whale', 20.002847184002935),
 ('mrs_hussey', 10.5625),
 ('mast_heads', 4.391152941176471),
 ('sag_harbor', 4.0),
 ('vinegar_cruet', 4.0),
 ('try_works', 3.7944046844502277),
 ('dough_boy', 3.7067873303167422),
 ('white_whale', 3.698807453416149),
 ('caw_caw', 3.4722222222222223),
 ('samuel_enderby', 3.4285714285714284),
 ('cape_horn', 3.4133333333333336),
 ('new_bedford', 3.3402061855670104),
 ('quarter_deck', 3.2339339991315676),
 ('deacon_deuteronomy', 3.2),
 ('father_mapple', 3.0),
 ('gamy_jesty', 3.0),
 ('hoky_poky', 3.0),
 ('jesty_joky', 3.0),
 ('joky_hoky', 3.0)]

## Exercise

Extract the top 10 collocations for the Twitter data. You need to preprocess the data first!

In [None]:
! pip install emoji

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |██▌                             | 10kB 13.7MB/s eta 0:00:01[K     |█████                           | 20kB 13.0MB/s eta 0:00:01[K     |███████▌                        | 30kB 8.7MB/s eta 0:00:01[K     |██████████                      | 40kB 7.3MB/s eta 0:00:01[K     |████████████▌                   | 51kB 4.3MB/s eta 0:00:01[K     |███████████████                 | 61kB 4.8MB/s eta 0:00:01[K     |█████████████████▌              | 71kB 4.8MB/s eta 0:00:01[K     |████████████████████            | 81kB 5.2MB/s eta 0:00:01[K     |██████████████████████▌         | 92kB 5.4MB/s eta 0:00:01[K     |█████████████████████████       | 102kB 4.1MB/s eta 0:00:01[K     |███████████████████████████▌    | 112kB 4.1MB/s eta 0:00:01[K     |██████████████████████████████  | 122kB 4.1MB/s eta 0:00:

In [None]:
import spacy
# Load the English pipeline by its package name: the 'en' shortcut link is
# no longer supported by spaCy 3
# NOTE(review): assumes the en_core_web_sm package is installed — confirm
nlp = spacy.load('en_core_web_sm')

from nltk import SnowballStemmer # most famous for stemming
stemmer = SnowballStemmer('english')

import emoji

In [None]:
# Patterns used by preprocess(), compiled once instead of on every call.
# They are applied to text that has already been lower-cased.
_NUMBERS = re.compile(r'[0-9]+')  # runs of digits
_USERS = re.compile(r'@[A-Za-z0-9_]+')  # @user mentions
_PUNCTS = re.compile(r"""['/".,;!?:-]""")  # punctuation marks
_PARENS = re.compile(r'[()\[\]{}]')  # round, square and curly brackets
_HASHES = re.compile(r"#[A-Za-z0-9_']+")  # #hashtags
_LINKS = re.compile(r'https?://t\.co/\S+')  # t.co short links
_DATES_ISO = re.compile(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')  # ex. 2003-08-06
# the month abbreviation is lower case here because the text is lower-cased
_DATES_TEXT = re.compile(r'\b[a-z]{3} [0-9]+, [0-9]{4}\b')  # ex. jan 3, 2003
# ex. DD/MM/YY or DD/MM/YYYY or MM/DD/YY or MM/DD/YYYY, anywhere in the text
_DATES_SLASH = re.compile(r'\b\d{1,2}/\d{1,2}/(?:(?:19|20)\d{2}|\d{2})\b')

# Order matters: mentions, hashtags, links and dates have to be removed
# before the generic digit/punctuation stripping destroys their shape
_REMOVALS = (_USERS, _HASHES, _LINKS,
             _DATES_ISO, _DATES_TEXT, _DATES_SLASH,
             _NUMBERS, _PUNCTS, _PARENS)


def preprocess(tweet):
  """Turn one raw tweet into a list of content tokens.

  The text is lower-cased; emoji characters, @mentions, #hashtags, t.co
  links, dates, digits, punctuation and brackets are removed; whitespace is
  collapsed; and the result is tokenized with the module-level spaCy
  pipeline `nlp`. Only tokens longer than two characters that spaCy does not
  flag as stop words are kept.

  Args:
    tweet: the raw tweet text (a string).

  Returns:
    A list with the text of the remaining tokens, in their original order.
  """
  tweet_string = tweet.lower()

  # remove emoji: drop every character listed in the emoji package's table
  # NOTE(review): UNICODE_EMOJI_ALIAS_ENGLISH belongs to the emoji 1.x API —
  # confirm it exists in the installed version
  tweet_string = ''.join(char for char in tweet_string
                         if char not in emoji.UNICODE_EMOJI_ALIAS_ENGLISH)

  for junk in _REMOVALS:
    tweet_string = junk.sub('', tweet_string)

  tweet_string = ' '.join(tweet_string.split()) # collapse runs of whitespace

  # NOTE(review): HTML entities are not decoded, so a tweet containing
  # "m&amp;g" yields the token 'm&ampg' once the ';' has been stripped
  # remove stop words and very short tokens
  return [token.text for token in nlp(tweet_string)
          if len(token) > 2 and not token.is_stop]

# in tweets[10] there is a graphical emoji
# in tweets[17] there is a string emoji

tweet = tweets[55]
preprocess(tweet)

['wait',
 'justin',
 'world',
 'tour',
 'buy',
 'm&ampg',
 'able',
 'words',
 'met',
 'justin']

In [None]:
# Keep the tokens in their original order and with their repetitions:
# collocation scores are computed from how often words occur next to each
# other, and de-duplicating through set() would discard both
words = []
for tweet in tweets:
  words.extend(preprocess(tweet))

words

KeyboardInterrupt: ignored

In [None]:
# Count adjacent word pairs over the whole token list
# NOTE(review): `words` is one flat list, so pairs that straddle the boundary
# between two tweets are presumably counted as well — confirm this is intended
finder = BigramCollocationFinder.from_words(words)
bgm = BigramAssocMeasures()
score = bgm.mi_like # mutual information like measure

In [None]:
# Key each scored bigram by its two words joined with '_', then show the ten
# entries with the highest scores
collocations = dict(
    ('_'.join(bigram), pmi) for bigram, pmi in finder.score_ngrams(score)
)
Counter(collocations).most_common(10)