In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from langdetect import detect

In [2]:
# https://matplotlib.org/stable/gallery/text_labels_and_annotations/fonts_demo.html
# font for displaying confusion matrix
font = {'family': 'sans-serif',
        #'color':  'darkred',
        'weight': 'heavy',
        'size': 6,
        }

### get data ready
 - read data
 - format the columns and column names 
 - add language detection
 - filter by agreement and language

 - language detection takes long time, on the full SDG dataset, it takes neara 3 minutes
 - may not want to run it unless really needed, or run it after filtering by other dimensions

In [3]:
text_df = pd.read_csv("/Users/yingli/Downloads/osdg-community-data-v2023-01-01.csv", sep = "\t", quotechar='"')
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df.drop(text_df.columns.values[0],axis = 1, inplace=True)
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
#text_df["lang"] = text_df["text"].apply(lambda x: detect(x))
#text_df = text_df.query("lang == 'en'")# language detection cost extra time, do it after other filterings to minimize cost
text_df.reset_index(inplace=True)

In [22]:
text_df.head()

Unnamed: 0,index,doi,text_id,text,sdg,labels_negative,labels_positive,agreement
0,0,10.6027/9789289342698-7-en,00021941702cd84171ff33962197ca1f,"""From a gender perspective, Paulgaard points o...",5,1,8,0.777778
1,2,10.1787/9789264289062-4-en,0004eb64f96e1620cd852603d9cbe4d4,The average figure also masks large difference...,3,1,8,0.777778
2,5,10.1787/5js4xfgl4ks0-en,000b54717f2deea5d99055b4c1c2bf5a,These findings are consistent with previous wo...,10,2,7,0.555556
3,7,10.1787/9789264117563-8-en,000bfb17e9f3a00d4515ab59c5c487e7,The Israel Oceanographic and Limnological Rese...,6,0,3,1.0
4,8,10.18356/805b1ae4-en,001180f5dd9a821e651ed51e30d0cf8c,Previous chapters have discussed ways to make ...,2,0,3,1.0


In [27]:
text_df.shape

(24669, 8)

In [26]:
len(text_df.text[0].split())

163

In [29]:
docs = text_df.text
count_vectorizer = CountVectorizer() # default is unigram, no stop word removal
count_vectorizer.fit(docs) 
len(count_vectorizer.vocabulary_)

45738

In [30]:
count_vectorizer.vocabulary_

{'from': 17320,
 'gender': 17817,
 'perspective': 31409,
 'paulgaard': 30974,
 'points': 32035,
 'out': 30158,
 'that': 41221,
 'the': 41229,
 'labour': 23893,
 'markets': 25859,
 'of': 29487,
 'fishing': 16605,
 'villages': 44151,
 'have': 19212,
 'been': 5401,
 'highly': 19611,
 'segregated': 37317,
 'in': 20921,
 'terms': 41095,
 'existence': 15618,
 'male': 25572,
 'jobs': 22833,
 'and': 3469,
 'female': 16283,
 'however': 20021,
 'new': 28504,
 'business': 6873,
 'opportunities': 29841,
 'led': 24333,
 'to': 41630,
 'population': 32228,
 'peripheral': 31312,
 'areas': 4044,
 'now': 29057,
 'working': 45203,
 'service': 37557,
 'industry': 21264,
 'former': 17023,
 'boys': 6375,
 'girls': 18111,
 'are': 4040,
 'doing': 13104,
 'same': 36665,
 'indicates': 21161,
 'change': 7938,
 'because': 5370,
 'traditional': 41927,
 'boundaries': 6336,
 'between': 5660,
 'women': 45152,
 'men': 26434,
 'work': 45189,
 'being': 5451,
 'crossed': 10661,
 'but': 6886,
 'fact': 15924,
 'young': 455

In [31]:
docs = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(1,1),stop_words='english') 
count_vectorizer.fit(docs) 
len(count_vectorizer.vocabulary_)

45440

In [32]:
print(count_vectorizer.get_stop_words())

frozenset({'your', 'alone', 'why', 'something', 'then', 'please', 'up', 'whose', 'describe', 'interest', 'mine', 'side', 'thence', 'they', 'either', 'its', 'and', 'serious', 'beyond', 'although', 'has', 'of', 'also', 'among', 'almost', 'amongst', 'than', 'ours', 'thereafter', 'because', 'etc', 'else', 'nine', 'former', 'never', 'this', 'con', 'at', 'several', 'around', 'afterwards', 'hence', 'namely', 'sometimes', 'into', 'do', 'whatever', 'amount', 'however', 'them', 'otherwise', 'bottom', 'been', 'last', 'yourself', 'call', 'thereupon', 'or', 'may', 'were', 'could', 'one', 'as', 'there', 'with', 'mill', 'on', 'ourselves', 'forty', 'de', 'cry', 'bill', 'couldnt', 'which', 'nowhere', 'five', 'some', 'show', 'to', 'fire', 'part', 'most', 'when', 'co', 'her', 'after', 'during', 'seem', 'whither', 'done', 'herein', 'much', 'since', 'the', 'he', 'still', 'his', 'we', 'from', 'be', 'already', 'empty', 'keep', 'a', 're', 'everyone', 'not', 'onto', 'well', 'hereafter', 'becomes', 'out', 'enou

 - difference is not the same as the stop_words length, why?
 - perhaps some stop words never appeared in our corpus

In [33]:
45738 - 45440, len(count_vectorizer.get_stop_words())


(298, 318)

In [34]:
count_vectorizer.vocabulary_.keys()



 - take a look at a portion of the term-document matrix

In [9]:
docs = text_df.text
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer.fit(docs)
count_vector = count_vectorizer.transform(docs).toarray() 
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
count_vector_df.loc[100:125,term_freq.sort_values(by="freq", ascending =False)[:20].term] # take a portion

Unnamed: 0,countries,women,development,health,water,public,social,education,policy,international,law,energy,national,rights,oecd,economic,use,new,level,income
100,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
101,1,0,0,0,0,2,0,0,0,1,0,0,1,0,0,0,0,0,1,0
102,2,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
103,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
104,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
105,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
106,1,3,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,2
107,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
108,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
109,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0


In [10]:
term_freq.sort_values(by="freq", ascending=False)

Unnamed: 0,term,freq
countries,countries,7761
women,women,5984
development,development,5312
health,health,4685
water,water,4664
...,...,...
envisioning,envisioning,1
envolee,envolee,1
ownerless,ownerless,1
envsec,envsec,1


In [11]:
term_freq.freq.sum()

1351051

In [12]:
docs.apply(lambda x: len(x.split())).sum()

2351777

 - reduced amount of word occurances by 40%

 - check the bigrams

In [35]:
docs = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english') 
count_vectorizer.fit(docs) 
len(count_vectorizer.vocabulary_)

834465

 - check the tri-grams

In [36]:
docs = text_df.text
count_vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english') 
count_vectorizer.fit(docs) 
len(count_vectorizer.vocabulary_)

1214215

In [15]:
# proportion of non-zeros
print('vocabulary size: ' , len(count_vectorizer.vocabulary_))
print('vector shape: ', count_vector.shape)
print("proportion of non-zeros: ", np.count_nonzero(count_vector)/(count_vector.shape[0]*count_vector.shape[1]))

vocabulary size:  1214215
vector shape:  (24669, 45440)
proportion of non-zeros:  0.0010100642720892218


In [16]:
count_vectorizer = CountVectorizer(ngram_range = (2,2),stop_words='english', min_df=5)
count_vector = count_vectorizer.fit_transform(docs).toarray()
count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())

# this below will not finish because it takes lots of resources
# need to use min_df = 2 and above in order for this to run
bigram_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})

In [17]:
print(bigram_freq.query("term == 'rural areas'")["freq"])
print("vocabulary index is: ", count_vectorizer.vocabulary_.get("rural areas"))

rural areas    411
Name: freq, dtype: int64
vocabulary index is:  21012


### get count vector and then followed by sum is an expensive way to get word frequency
 - may be better rely on vocabulary, which is a dictionary

In [18]:
bigram_freq.sort_values(by="freq", ascending =False)[:30]

Unnamed: 0,term,freq
human rights,human rights,1981
climate change,climate change,1301
et al,et al,1167
oecd countries,oecd countries,898
health care,health care,881
international law,international law,782
united states,united states,778
developing countries,developing countries,772
long term,long term,743
labour market,labour market,720


In [19]:
count_vectorizer.vocabulary_

{'gender perspective': 9554,
 'labour markets': 13135,
 'gender segregated': 9563,
 'new business': 15837,
 'business opportunities': 2945,
 'peripheral areas': 17174,
 'boys girls': 2786,
 'women men': 25529,
 'men work': 14961,
 'young people': 25906,
 'people working': 17080,
 'young adults': 25899,
 'described earlier': 5848,
 'spend time': 22540,
 'average figure': 2262,
 'large differences': 13266,
 'parts population': 16900,
 'likely limited': 13885,
 'limited access': 13914,
 'access primary': 821,
 'primary care': 18313,
 'care addition': 3104,
 'addition poor': 1098,
 'findings consistent': 8923,
 'previous work': 18258,
 'differences wage': 6196,
 'wage inequality': 25070,
 'inequality countries': 12162,
 'fournier koske': 9267,
 'koske 2012': 13113,
 'returns education': 20637,
 'education important': 6995,
 'important role': 11377,
 'earnings inequality': 6716,
 '2014 countries': 431,
 'supply demand': 23184,
 'demand skills': 5742,
 'role played': 20966,
 'labour market':

In [38]:
bigram_freq.query("freq==5").nunique()


term    6038
freq       1
dtype: int64

In [21]:
count_vectorizer.vocabulary_[u'women'] # u'women' for unicode string 'women'



KeyError: 'women'

In [None]:
count_vectorizer.vocabulary_.get(u'women') # u'women' for unicode string 'women'


44154

### putting together as a function
 - enable stop words removal
 - enable unigram, bigram, tri-gram

In [None]:
def get_term_freq(docs, ngram_range = (1, 1), stop_words = None, min_df = 2):
    count_vectorizer = CountVectorizer(ngram_range = ngram_range, stop_words = stop_words, min_df =min_df)
    count_vector = count_vectorizer.fit_transform(docs).toarray()# generate a matrix
    count_vector_df = pd.DataFrame(count_vector, columns=count_vectorizer.get_feature_names_out())
    term_freq = pd.DataFrame({"term": count_vector_df.columns.values, "freq" : count_vector_df.sum(axis=0)})
    return term_freq


In [None]:
docs = text_df.text
term_freq_1_1_stop = get_term_freq(docs, stop_words='english').sort_values(by="freq", ascending=False)
term_freq_1_1_no_stop = get_term_freq(docs).sort_values(by="freq", ascending=False)
term_freq_2_2_stop = get_term_freq(docs, ngram_range = (2,2), stop_words = 'english').sort_values(by="freq", ascending=False)

In [None]:
term_freq_2_2_stop.head(50)

Unnamed: 0,term,freq
countries,countries,7761
women,women,5984
development,development,5312
health,health,4685
water,water,4664
public,public,4591
social,social,4538
education,education,4535
policy,policy,4367
international,international,4360


In [None]:
count_vectorizer.vocabulary_.items()



In [None]:
count_vectorizer.get_stop_words()

In [None]:
count_vectorizer.get_feature_names_out()

array(['00', '000', '0000002', ..., 'œopen', 'ʿadawiyya', '四个全面'],
      dtype=object)

In [None]:
import re

In [None]:
t1 = text_df.text[1]

In [None]:
print(re.search(r'China', t1))
print(re.search(r'[a-z]\,',t1))


None
<re.Match object; span=(152, 154), match='u,'>


In [None]:
from nltk.corpus import wordnet

In [None]:
wordnet.synsets('dog'),wordnet.synsets('dog', pos=wordnet.VERB)


([Synset('dog.n.01'),
  Synset('frump.n.01'),
  Synset('dog.n.03'),
  Synset('cad.n.01'),
  Synset('frank.n.02'),
  Synset('pawl.n.01'),
  Synset('andiron.n.01'),
  Synset('chase.v.01')],
 [Synset('chase.v.01')])

In [None]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(set(synonyms))
print(set(antonyms))

{'just', 'practiced', 'unspoiled', 'safe', 'proficient', 'estimable', 'well', 'full', 'respectable', 'near', 'in_force', 'serious', 'adept', 'commodity', 'soundly', 'good', 'skillful', 'expert', 'thoroughly', 'honorable', 'undecomposed', 'right', 'sound', 'effective', 'in_effect', 'beneficial', 'upright', 'salutary', 'honest', 'dear', 'secure', 'skilful', 'ripe', 'trade_good', 'dependable', 'unspoilt', 'goodness'}
{'ill', 'evilness', 'evil', 'bad', 'badness'}


In [None]:
syns = wordnet.synsets("good")

In [None]:
syns[0].examples()

['for your own good', "what's the good of worrying?"]

In [None]:
w1 = wordnet.synset('ship.n.01')
w2 = wordnet.synset('boat.n.01')
w3 = wordnet.synset('car.n.01')
w4 = wordnet.synset('cat.n.01')
print(w1.wup_similarity(w2)), print(w1.wup_similarity(w3)),print(w1.wup_similarity(w4)),print(w2.wup_similarity(w3)), print(w2.wup_similarity(w4)), print(w3.wup_similarity(w4))

0.9090909090909091
0.6956521739130435
0.32
0.6956521739130435
0.32
0.32


(None, None, None, None, None, None)