In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
sdg_names = pd.read_excel("Digital Science SDG training set searches.xlsx")
sdg_names

Unnamed: 0.1,Unnamed: 0,Each tab contains the keywords/phrases used for generating the training set.,Unnamed: 2
0,,,
1,,,
2,Goal,Name,
3,1,No Poverty,End poverty in all its forms everywhere
4,2,Zero Hunger,"End hunger, achieve food security and improved..."
5,3,Good Health and Well Being,Ensure healthy lives and promote well-being fo...
6,4,Quality Education,Ensure inclusive and equitable quality educati...
7,5,Gender Equality,Achieve gender equality and empower all women ...
8,6,Clean Water and Sanitation,Ensure availability and sustainable management...
9,7,Affordable and Clean Energy,"Ensure access to affordable, reliable, sustain..."


In [3]:
sdg_names = sdg_names.drop([0,1,2], axis=0)
sdg_names = sdg_names.set_axis(["sdg", "sdg_name", "sdg_definition"],axis=1, copy=False)
sdg_names

Unnamed: 0,sdg,sdg_name,sdg_definition
3,1,No Poverty,End poverty in all its forms everywhere
4,2,Zero Hunger,"End hunger, achieve food security and improved..."
5,3,Good Health and Well Being,Ensure healthy lives and promote well-being fo...
6,4,Quality Education,Ensure inclusive and equitable quality educati...
7,5,Gender Equality,Achieve gender equality and empower all women ...
8,6,Clean Water and Sanitation,Ensure availability and sustainable management...
9,7,Affordable and Clean Energy,"Ensure access to affordable, reliable, sustain..."
10,8,Decent Work and Economic Growth,"Promote sustained, inclusive and sustainable e..."
11,9,"Industry, Innovation and Infrastructure","Build resilient infrastructure, promote inclus..."
12,10,Reduced Inequalities,Reduce inequality within and among countries


In [4]:
text_df = pd.read_csv("/Users/yingli/Downloads/osdg-community-dataset-v21-09-30.csv", sep = "\t")
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32120 entries, 0 to 32119
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   doi              32120 non-null  object 
 1   text_id          32120 non-null  object 
 2   text             32120 non-null  object 
 3   sdg              32120 non-null  int64  
 4   labels_negative  32120 non-null  int64  
 5   labels_positive  32120 non-null  int64  
 6   agreement        32120 non-null  float64
dtypes: float64(1), int64(3), object(3)
memory usage: 1.7+ MB


In [5]:
text_df = pd.read_csv("/Users/yingli/Downloads/osdg-community-data-v2023-01-01.csv", sep = "\t", quotechar='"')
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40062 entries, 0 to 40061
Data columns (total 1 columns):
 #   Column                                                          Non-Null Count  Dtype 
---  ------                                                          --------------  ----- 
 0   doi	text_id	text	sdg	labels_negative	labels_positive	agreement  40062 non-null  object
dtypes: object(1)
memory usage: 313.1+ KB


### data cleaning

In [6]:
text_df = pd.read_csv("/Users/yingli/Downloads/osdg-community-data-v2023-01-01.csv", sep = "\t", quotechar='"')
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
text_df.reset_index(inplace=True)
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24669 entries, 0 to 24668
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            24669 non-null  int64  
 1   doi              24669 non-null  object 
 2   text_id          24669 non-null  object 
 3   text             24669 non-null  object 
 4   sdg              24669 non-null  int64  
 5   labels_negative  24669 non-null  int64  
 6   labels_positive  24669 non-null  int64  
 7   agreement        24669 non-null  float64
dtypes: float64(1), int64(4), object(3)
memory usage: 1.5+ MB


### detecting language in the text

In [7]:
from langdetect import detect
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))
print(detect('四个和尚没水喝'))
print(detect('今一はお前さん'))

en
de
zh-cn
ja


 - add a column in data frame for the corresponding language

In [8]:
from langdetect import detect
text_df["lang"] = text_df["text"].apply(lambda x: detect(x))
text_df.lang.value_counts()

en    24620
fr       40
es        5
de        1
et        1
fi        1
id        1
Name: lang, dtype: int64

### should we filter out those non-english texts?

In [9]:
text_df.query("lang== 'fr'")["text"]

152      Une telle decomposition par categorie UICN n'e...
581      Le papier discute ensuite les choix methodolog...
1556     Les problemes de sante mentale legers a modere...
3148     Cependant, le developpement de mecanismes de r...
5026     Par contre, on n’aper^oit pas une difference c...
5648     Moreover, a budget-neutral redistribution of f...
6191     Il existe une raison supplementaire pour mettr...
7160     A cette fin, il est divise en trois sections. ...
7364     Les taux d’emploi feminins reagissent aux vari...
7593     The emphasis is on understanding how increasin...
8619     Entre 2007 et 2012, le niveau de vie median de...
8661     Enfin, il analyse, a partir des Statistiques d...
8774     This paper looks at the features of good caree...
9387     La mise en oeuvre de l'enquete se fait de mani...
9718     Par ailleurs, les resultats suggerent que l’ef...
10179    La protection economique des femmes en age ava...
10425    "En particulier, ce qui importe le plus est l'.

 - use str.find to locate the position of a substring in a string, negative return indicate not found

In [10]:
text_df[text_df["text"].apply(lambda x: x.find('四个全面')) > 0]["text"]

18009    In China today,President Xi Jinping’s new gran...
Name: text, dtype: object

In [11]:
t1 = text_df[text_df["text"].apply(lambda x: x.find('四个全面')) > 0]["text"].values[0]
print(t1 == text_df.iloc[18009].text)
print(t1 is text_df.iloc[18009].text)
print(t1 == text_df.iloc[18009].text.casefold())
print(t1.lower() == text_df.iloc[18009].text.casefold())

True
True
False
True


In [12]:
print(t1.casefold())

in china today,president xi jinping’s new grand narrative is framed by the widely publicised ‘four comprehensives’ (四个全面, ‘sigequanmian’). this narrative aims to : 1.build a moderately prosperous society, 2. deepen reform, 3. govern the nation according to law, and 4. tighten party discipline. it is essentially a political narrative that tells a moral tale that legitimizes and glorifies the virtues of the present. it also attempts to shrug off mistakes of the previous dynasty. drawing on the legal disciplines of economic law, international comparative law and the a priori analytic method of legal narrative analysis, this paper provides a critical appraisal of the ‘four comprehensives’, paying special attention to how the four principal strands of the narrative shape the directions of china’s socialist rule of law reforms and governance.  key words: chinese law reform, four comprehensives, 四个全面, sigequanmian,socialist rule of law,legal narrative analysis, economic-law nexus, south china

In [13]:
text = 'groß'

# convert text to lowercase using casefold()
print('Using casefold():', text.casefold()) 

# convert text to lowercase using lower()
print('Using lower():', text.lower())

Using casefold(): gross
Using lower(): groß


In [14]:
text_df.iloc[3].text.lower()

'the israel oceanographic and limnological research station monitors the quantity and quality of water along the coastline of the mediterranean sea. the nature and parks authority (npa) monitors water quality in rivers on behalf of the moep. mekorot and local authorities monitor drinking water quality under the supervision of the ministry of health. the ministry of health monitors effluent quality prior to its use in the agricultural sector.'

 - use google translate

In [15]:
from googletrans import Translator
t = Translator().detect('四个全面')
print(t.lang, t.confidence)
print(Translator().detect("War doesn't show who's right, just who's left."))
print(Translator().detect("Ein, zwei, drei, vier"))
print(Translator().detect('四个和尚没水喝'))
print(Translator().detect('今一はお前さん'))

zh-CN 1
Detected(lang=en, confidence=1)
Detected(lang=de, confidence=1)
Detected(lang=zh-CN, confidence=1)
Detected(lang=ja, confidence=1)


 - use fasttext to detect language
 - it can return to k languages, by probability

In [16]:
import fasttext
model = fasttext.load_model('lid.176.ftz')
print(model.predict(t1, k=3))
print(model.predict("In China today,President Xi Jinping’s new grand narrative is framed by the widely publicised ‘four comprehensives’ (四个全面, ‘sigequanmian’)."))
print(model.predict('四个全面', k=3))
print(model.predict('الشمس تشرق', k=2))  # top 2 matching languages
print(model.predict("影響包含對氣候的變化以及自然資源的枯竭程度",k=2))  # top 2 matching languages


(('__label__en', '__label__fr', '__label__es'), array([0.87778687, 0.00653897, 0.00630662]))
(('__label__en',), array([0.84054291]))
(('__label__zh', '__label__ja'), array([0.68349081, 0.31656581]))
(('__label__ar', '__label__fa'), array([0.98124713, 0.01265871]))
(('__label__zh', '__label__fr'), array([9.99575794e-01, 2.11721170e-04]))




 - use SpaCy to tag part of speech

In [17]:
import spacy
import en_core_web_sm
from spacy_langdetect import LanguageDetector
nlp = spacy.load("en_core_web_sm")
nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])
text_content = "Er lebt mit seinen Eltern und seiner Schwester in Berlin."
doc = nlp(text_content) #3
print([(w.text, w.pos_) for w in doc])
doc = nlp(t1)
print([(w.text, w.pos_) for w in doc])


[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]
[('Er', 'INTJ'), ('lebt', 'NOUN'), ('mit', 'NOUN'), ('seinen', 'NOUN'), ('Eltern', 'PROPN'), ('und', 'NOUN'), ('seiner', 'NOUN'), ('Schwester', 'PROPN'), ('in', 'ADP'), ('Berlin', 'PROPN'), ('.', 'PUNCT')]
[('In', 'ADP'), ('China', 'PROPN'), ('today', 'NOUN'), (',', 'PUNCT'), ('President', 'PROPN'), ('Xi', 'PROPN'), ('Jinping', 'PROPN'), ('’s', 'PART'), ('new', 'ADJ'), ('grand', 'ADJ'), ('narrative', 'NOUN'), ('is', 'AUX'), ('framed', 'VERB'), ('by', 'ADP'), ('the', 'DET'), ('widely', 'ADV'), ('publicised', 'VERB'), ('‘', 'PUNCT'), ('four', 'NUM'), ('comprehensives', 'NOUN'), ('’', 'PUNCT'), ('(', 'PUNCT'), ('四个全面', 'NOUN'), (',', 'PUNCT'), ('‘', 'PUNCT'), ('sigequanmian', 'ADJ'), ('’', 'PUNCT'), (')', 'PUNCT'), ('.', 'PUNCT'), ('This', 'DET'), ('narrative', 'NOUN'), ('aims', 'VERB'), ('to', 'PART'), (':', 'PUNCT'), ('1.build', 'NUM'), ('a', 'DET'), ('moderately', 'ADV'), ('prosperous', 'ADJ'), ('soci

In [18]:
doc = nlp(t1)
for i, sent in enumerate(doc.sents):
    print(i, len(sent), sent)

0 29 In China today,President Xi Jinping’s new grand narrative is framed by the widely publicised ‘four comprehensives’ (四个全面, ‘sigequanmian’).
1 32 This narrative aims to : 1.build a moderately prosperous society, 2. deepen reform, 3. govern the nation according to law, and 4. tighten Party discipline.
2 21 It is essentially a political narrative that tells a moral tale that legitimizes and glorifies the virtues of the present.
3 12 It also attempts to shrug off mistakes of the previous dynasty.
4 63 Drawing on the legal disciplines of economic law, international comparative law and the a priori analytic method of legal narrative analysis, this paper provides a critical appraisal of the ‘four comprehensives’, paying special attention to how the four principal strands of the narrative shape the directions of China’s socialist rule of law reforms and governance.  
5 32 Key words: Chinese law reform, four comprehensives, 四个全面, sigequanmian,socialist rule of law,legal narrative analysis, 

In [19]:
from nltk import stem

tokens =  ['narrative', 'comprehensives', 'publicised', 'moderately', 'prosperous', 'legitimizes', 'according', 'glorifies'] 

porter = stem.porter.PorterStemmer() # Porter stemmer
snowball = stem.snowball.EnglishStemmer() # Snowball stemmer
lancaster = stem.lancaster.LancasterStemmer() # Lancaster stemmer

print('Porter Stemmer:', [porter.stem(i) for i in tokens])
print('Snowball Stemmer:', [snowball.stem(i) for i in tokens])
print('Lancaster Stemmer:', [lancaster.stem(i) for i in tokens])


Porter Stemmer: ['narr', 'comprehens', 'publicis', 'moder', 'prosper', 'legitim', 'accord', 'glorifi']
Snowball Stemmer: ['narrat', 'comprehens', 'publicis', 'moder', 'prosper', 'legitim', 'accord', 'glorifi']
Lancaster Stemmer: ['nar', 'comprehend', 'publ', 'mod', 'prosp', 'legitim', 'accord', 'glor']


In [20]:
t2='start'
t2.startswith('S')
'w' in t2

False

In [21]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [22]:
import textblob


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
docs = text_df.text
count_vectorizer = CountVectorizer()
count_vectorizer.fit(docs)  #print('vocabulary: ', count_vectorizer.vocabulary_)
count_vector = count_vectorizer.transform(docs) #print('vectors: ', count_vector.toarray())


In [24]:
docs = text_df.text
count_vectorizer = CountVectorizer()  # add stop_words = 'english' to remove stop words
count_vector_dense = count_vectorizer.fit_transform(docs).todense()
temp_df = pd.DataFrame(count_vector_dense, columns=count_vectorizer.get_feature_names_out())


In [25]:
term_freq = pd.DataFrame({"term": temp_df.columns.values, "freq" : temp_df.sum(axis=0)})

In [26]:
term_freq.freq.sum()

2335467

In [27]:
docs.apply(lambda x: len(x.split())).sum()

2351777

notice the difference - perhaps due to certain processing details such as handling special characters, 

In [28]:
t1.split()

['In',
 'China',
 'today,President',
 'Xi',
 'Jinping’s',
 'new',
 'grand',
 'narrative',
 'is',
 'framed',
 'by',
 'the',
 'widely',
 'publicised',
 '‘four',
 'comprehensives’',
 '(四个全面,',
 '‘sigequanmian’).',
 'This',
 'narrative',
 'aims',
 'to',
 ':',
 '1.build',
 'a',
 'moderately',
 'prosperous',
 'society,',
 '2.',
 'deepen',
 'reform,',
 '3.',
 'govern',
 'the',
 'nation',
 'according',
 'to',
 'law,',
 'and',
 '4.',
 'tighten',
 'Party',
 'discipline.',
 'It',
 'is',
 'essentially',
 'a',
 'political',
 'narrative',
 'that',
 'tells',
 'a',
 'moral',
 'tale',
 'that',
 'legitimizes',
 'and',
 'glorifies',
 'the',
 'virtues',
 'of',
 'the',
 'present.',
 'It',
 'also',
 'attempts',
 'to',
 'shrug',
 'off',
 'mistakes',
 'of',
 'the',
 'previous',
 'dynasty.',
 'Drawing',
 'on',
 'the',
 'legal',
 'disciplines',
 'of',
 'economic',
 'law,',
 'international',
 'comparative',
 'law',
 'and',
 'the',
 'a',
 'priori',
 'analytic',
 'method',
 'of',
 'legal',
 'narrative',
 'analysis

In [29]:
set(count_vectorizer.vocabulary_.keys())

{'systematized',
 'regis2',
 'abb',
 'decided',
 'berglund',
 'synchronisation',
 'categorise',
 'alerted',
 'hilber',
 'robotization',
 'portrayals',
 'interconnecting',
 'fundamentality',
 'eligibility',
 'vdab',
 'potenza',
 'pessimistic',
 'inacio',
 'juglans',
 'coherent2',
 'immoderate',
 'tory',
 'laced',
 'cabg',
 'hierarchy',
 'yeates',
 'steensen',
 'traveling',
 'micro',
 'logistically',
 'techies',
 'fonn',
 'chuvash',
 'iccs',
 'cozzi',
 'topography',
 'armvodkanal',
 'erwin',
 'ilibraiy',
 'legitimate',
 'optimized',
 'storm',
 'conferring',
 'diagnose',
 '195',
 'upshot',
 'surfers',
 'categorizes',
 'beneath',
 'slaughtered',
 'livelihoods',
 'himself',
 'signalized',
 'statcubc',
 'noncombat',
 'geophysical',
 'tactics',
 'veracruz',
 'ho',
 'despair',
 'synthesise',
 'throughout',
 'mlaka',
 'primaiy',
 'ambassadors',
 'era',
 'prepares',
 'atca',
 'stressful',
 'algal',
 'slaughtering',
 'mfa',
 'autre',
 'recreationists',
 '505',
 'shallowness',
 'rothchild',
 'mili

In [30]:
term_freq.loc['sea']

term    sea
freq    387
Name: sea, dtype: object

In [31]:
term_freq.sort_values(by="freq", ascending=False)

Unnamed: 0,term,freq
the,the,143100
of,of,95834
and,and,93357
in,in,67152
to,to,64701
...,...,...
cheering,cheering,1
chekani,chekani,1
microscopes,microscopes,1
microscope,microscope,1


In [32]:
count_vectorizer.vocabulary_.get(u'women')


45152

In [33]:
count_vectorizer.get_feature_names_out()[count_vectorizer.get_feature_names_out() == "from"]

array(['from'], dtype=object)

In [34]:
np.count_nonzero(count_vector.toarray())


1647396

In [35]:
# proportion of non-zeros
np.count_nonzero(count_vector.toarray())/(count_vector.shape[0]*count_vector.shape[1])

0.0014600552559492562

In [36]:
print('vector shape: ', count_vector.shape)
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))

vector shape:  (24669, 45738)
vocabulary size:  45738


In [37]:
# load in all the modules we're going to need
import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams
#doc = text_df.iloc[0].text
docs = text_df.text


In [38]:
count_vectorizer.vocabulary_

{'from': 17320,
 'gender': 17817,
 'perspective': 31409,
 'paulgaard': 30974,
 'points': 32035,
 'out': 30158,
 'that': 41221,
 'the': 41229,
 'labour': 23893,
 'markets': 25859,
 'of': 29487,
 'fishing': 16605,
 'villages': 44151,
 'have': 19212,
 'been': 5401,
 'highly': 19611,
 'segregated': 37317,
 'in': 20921,
 'terms': 41095,
 'existence': 15618,
 'male': 25572,
 'jobs': 22833,
 'and': 3469,
 'female': 16283,
 'however': 20021,
 'new': 28504,
 'business': 6873,
 'opportunities': 29841,
 'led': 24333,
 'to': 41630,
 'population': 32228,
 'peripheral': 31312,
 'areas': 4044,
 'now': 29057,
 'working': 45203,
 'service': 37557,
 'industry': 21264,
 'former': 17023,
 'boys': 6375,
 'girls': 18111,
 'are': 4040,
 'doing': 13104,
 'same': 36665,
 'indicates': 21161,
 'change': 7938,
 'because': 5370,
 'traditional': 41927,
 'boundaries': 6336,
 'between': 5660,
 'women': 45152,
 'men': 26434,
 'work': 45189,
 'being': 5451,
 'crossed': 10661,
 'but': 6886,
 'fact': 15924,
 'young': 455

In [39]:
count_vector

<24669x45738 sparse matrix of type '<class 'numpy.int64'>'
	with 1647396 stored elements in Compressed Sparse Row format>

In [40]:
count_vectorizer.get_stop_words()

In [41]:
count_vectorizer.get_feature_names_out()

array(['00', '000', '0000002', ..., 'œopen', 'ʿadawiyya', '四个全面'],
      dtype=object)

In [42]:
import re

In [43]:
t1

'In China today,President Xi Jinping’s new grand narrative is framed by the widely publicised ‘four comprehensives’ (四个全面, ‘sigequanmian’). This narrative aims to : 1.build a moderately prosperous society, 2. deepen reform, 3. govern the nation according to law, and 4. tighten Party discipline. It is essentially a political narrative that tells a moral tale that legitimizes and glorifies the virtues of the present. It also attempts to shrug off mistakes of the previous dynasty. Drawing on the legal disciplines of economic law, international comparative law and the a priori analytic method of legal narrative analysis, this paper provides a critical appraisal of the ‘four comprehensives’, paying special attention to how the four principal strands of the narrative shape the directions of China’s socialist rule of law reforms and governance.  Key words: Chinese law reform, four comprehensives, 四个全面, sigequanmian,socialist rule of law,legal narrative analysis, economic-law nexus, South Chin

In [44]:
print(re.search(r'China', t1))
print(re.search(r'[a-z]\,',t1))


<re.Match object; span=(3, 8), match='China'>
<re.Match object; span=(13, 15), match='y,'>


In [45]:
from nltk import word_tokenize, sent_tokenize
docs_str = docs.str.cat()
tokens = word_tokenize(docs_str)
len(tokens), len(set(tokens))

(2628532, 83735)

In [46]:
from nltk import collocations
fd = collocations.FreqDist(tokens)
fd.most_common(30)

[(',', 130689),
 ('the', 126635),
 ('of', 95441),
 ('and', 92697),
 ('.', 69566),
 ('to', 63693),
 ('in', 59083),
 ('a', 33126),
 ('for', 27726),
 ('(', 26075),
 (')', 25135),
 ('is', 25113),
 ('that', 20236),
 ('are', 18153),
 ('on', 17660),
 ('as', 17215),
 ('with', 13771),
 ('by', 13633),
 ('be', 12592),
 ('The', 12564),
 ('from', 9589),
 ('have', 9048),
 ('or', 8979),
 ('has', 7529),
 ('not', 7481),
 ('this', 7474),
 ('an', 7415),
 ('countries', 7244),
 ('their', 7212),
 ('’', 7012)]

In [47]:
measures = collocations.BigramAssocMeasures()
c = collocations.BigramCollocationFinder.from_words(tokens)
c.nbest(measures.pmi,100)

[("'Camikavas", 'negi'),
 ("'Necessary", 'Thickening'),
 ("'PES-like", "fashion'and"),
 ("'Paris", "rulebook'.This"),
 ("'any", 'electrically'),
 ("'ex", 'turpi'),
 ("'family", "values'.To"),
 ("'nine", 'gems'),
 ('-16.58', 'p.p.'),
 ('-Midia', 'Navodari'),
 ('-at', 'times-also'),
 ('-chronically-ill', 'people-'),
 ('-red', 'ressing'),
 ('.95.5', '95.3'),
 ('.Case', 'Closed'),
 ('.Definiciones', 'Instrumentos'),
 ('.Minimum', 'wages—which'),
 ('//climatepolicyinitiative.org/', 'wp-content/uploads/2015/05/150512'),
 ('//data.uJorldbank.org/', 'about/country-classifications'),
 ('//documents.worldbank.org/curated/en/552891468229171088/Cambodia-Nutrition-at-a-g',
  'lance'),
 ('//doi.org/10.1080/', '07900627.2017.1401919'),
 ('//dx.doi.org/10.1787/', 'sti_scoreboard-2013-en'),
 ('//ec.euroDa.cu/soc', 'ial/BlobServlcl'),
 ('//economictimes.indiatimes.com/',
  'news/politics-and-nation/indias-renewable-energy-targets-catch-the-attention-of-global-investors-still-need-ground-work/articleshow

In [48]:
measures

<nltk.metrics.association.BigramAssocMeasures at 0x179605050>

In [49]:
# http://wordnetweb.princeton.edu/perl/webwn?s=good&sub=Search+WordNet&o2=&o0=1&o8=1&o1=1&o7=&o5=&o9=&o6=&o3=&o4=&h=

from nltk.corpus import wordnet

In [51]:
wordnet.synsets('dig'),wordnet.synsets('dig', pos=wordnet.VERB)


([Synset('dig.n.01'),
  Synset('shot.n.09'),
  Synset('dig.n.03'),
  Synset('excavation.n.01'),
  Synset('dig.n.05'),
  Synset('dig.v.01'),
  Synset('dig.v.02'),
  Synset('labor.v.02'),
  Synset('dig.v.04'),
  Synset('dig.v.05'),
  Synset('excavate.v.04'),
  Synset('jab.v.01'),
  Synset('grok.v.01')],
 [Synset('dig.v.01'),
  Synset('dig.v.02'),
  Synset('labor.v.02'),
  Synset('dig.v.04'),
  Synset('dig.v.05'),
  Synset('excavate.v.04'),
  Synset('jab.v.01'),
  Synset('grok.v.01')])

In [56]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'upright', 'ripe', 'skilful', 'dear', 'right', 'well', 'soundly', 'effective', 'near', 'in_effect', 'goodness', 'sound', 'full', 'serious', 'good', 'secure', 'practiced', 'unspoiled', 'thoroughly', 'proficient', 'estimable', 'salutary', 'adept', 'commodity', 'undecomposed', 'expert', 'safe', 'honorable', 'respectable', 'honest', 'in_force', 'dependable', 'unspoilt', 'just', 'trade_good', 'skillful', 'beneficial'}
{'ill', 'bad', 'evil', 'badness', 'evilness'}


In [53]:
syns = wordnet.synsets("good")

In [54]:
syns[0].examples()

['for your own good', "what's the good of worrying?"]

In [55]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
w3 = wordnet.synset('car.n.01')
w4 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2)), print(w1.wup_similarity(w3)),print(w1.wup_similarity(w4)),print(w2.wup_similarity(w3)), print(w2.wup_similarity(w4)), print(w3.wup_similarity(w4))

0.9090909090909091
0.6956521739130435
0.32
0.6956521739130435
0.32
0.32


(None, None, None, None, None, None)