In [21]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the research-area descriptions into a DataFrame and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# carries a saved index; passing index_col=0 may be appropriate — confirm.
GEOG_CSV_PATH = 'geography_research_areas.csv'
geog = pd.read_csv(GEOG_CSV_PATH, engine='python')
geog.head()

Unnamed: 0,Relevant Research Area
0,how land-use changes affect local climates wit...
1,examination of historical linkages between Sou...
2,"urban, social-cultural and tourism geographies"
3,general theme of geographic information scienc...
4,Transnational migration and citizenship\n'Dias...


In [4]:
# Preprocess text input: normalise the free-text research areas and tokenise
# them into stop-word-free word lists (`research_areas`) for the models below.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

geog['Relevant Research Area'] = (
    geog['Relevant Research Area']
    # missing descriptions become empty strings instead of crashing on .split()
    .fillna('')
    # remove random spacing/next line char by collapsing all whitespace runs
    .str.split().str.join(' ')
    # convert all to lower case
    .str.lower()
    # remove all numbers and non-word, non-space characters.
    # regex=True is required: newer pandas treats the pattern as a literal
    # string by default, which would silently leave the text untouched.
    .str.replace(r'[^a-zA-Z\s]+', '', regex=True)
)

# remove stop words
stop = stopwords.words('english')  # kept as a list; later cells reference `stop`
stop_lookup = set(stop)            # set gives constant-time membership tests
research_areas = [
    [token for token in text.split() if token not in stop_lookup]
    for text in geog['Relevant Research Area']
]

[nltk_data] Downloading package stopwords to /Users/uyen/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### LDA

In [7]:
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
from gensim import similarities

# Tunable settings for the topic model.
LDA_SEED = 124
LDA_NUM_TOPICS = 30

# Build the vocabulary and the bag-of-words corpus from the tokenised research areas.
dictionary = Dictionary(research_areas)
corpus = [dictionary.doc2bow(area) for area in research_areas]

# Global seed retained so any later cell that draws from NumPy's global RNG
# behaves as before.
np.random.seed(LDA_SEED)
# random_state gives the model its own seeded generator, so the topics do not
# depend on whether other code consumed the global RNG before training.
model_lda = ldamodel.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=LDA_NUM_TOPICS,
    random_state=LDA_SEED,
)
model_lda.show_topics()

[(21,
  '0.034*"migration" + 0.023*"change" + 0.020*"economic" + 0.019*"north" + 0.018*"climate" + 0.013*"global" + 0.013*"labour" + 0.013*"geography" + 0.013*"asia" + 0.013*"africa"'),
 (23,
  '0.027*"change" + 0.019*"migration" + 0.019*"citizenship" + 0.019*"health" + 0.019*"models" + 0.018*"climate" + 0.018*"global" + 0.018*"carbon" + 0.010*"international" + 0.010*"nature"'),
 (28,
  '0.019*"surface" + 0.019*"coastal" + 0.018*"geography" + 0.016*"particular" + 0.016*"including" + 0.016*"environmental" + 0.014*"sensing" + 0.014*"reference" + 0.013*"estuarine" + 0.013*"models"'),
 (5,
  '0.035*"development" + 0.016*"particular" + 0.016*"national" + 0.016*"focus" + 0.016*"resources" + 0.013*"also" + 0.013*"freshwater" + 0.012*"wealth" + 0.011*"victorian" + 0.011*"london"'),
 (15,
  '0.024*"carbon" + 0.016*"understand" + 0.016*"anthropogenic" + 0.016*"innovation" + 0.016*"quantify" + 0.016*"forest" + 0.011*"research" + 0.010*"reconstruction" + 0.008*"change" + 0.008*"urban"'),
 (1,
  '0

In [27]:
# Free-text description of a new research area to compare against the corpus.
new_area_text = "Research interests are grouped around the development and application of geographic information science and geographic information systems, latterly using Big Data analytics. Socioeconomic applications include: geo-temporal demographics; retail analysis; public service delivery (specifically health, education and policing); Internet GIS applications and e-social science; housing and retail market analysis; fractal analysis of cities; and social survey research practice. Contributions to the research literature have been funded by numerous research grants, including knowledge transfer/exchange funding."
# preprocess: strip everything except letters and whitespace, then lower-case.
# Raw string avoids the invalid '\s' escape warning of a plain string literal.
new_area_clean = re.sub(r'[^a-zA-Z\s]+', '', new_area_text).lower()
# remove stop words; `new_area` ends up as the list of remaining tokens
stop_words = set(stop)
new_area = [item for item in new_area_clean.split() if item not in stop_words]

In [28]:
# Convert the query tokens to a bag-of-words using the corpus vocabulary.
# Stored under its own name: overwriting `new_area` made this cell fail on a
# second run, because doc2bow would then receive (id, count) tuples, not tokens.
new_area_bow = dictionary.doc2bow(new_area)
# convert to lda space
new_area_lda = model_lda[new_area_bow]
new_area_lda

[(0, 0.032229105300430202),
 (4, 0.058405600578926346),
 (7, 0.50577681128204355),
 (17, 0.046879199348016279),
 (20, 0.094047590953170612),
 (22, 0.048517586133021033),
 (24, 0.050602716020406446),
 (26, 0.14565521152217983)]

In [54]:
# compare with the other documents in the corpus
# num_features is set explicitly to the number of topics: if it is left to be
# inferred from the corpus, the index is only as wide as the highest topic id
# that happens to occur, and a query containing a higher topic id would fail.
index = similarities.MatrixSimilarity(
    model_lda[corpus],
    num_features=model_lda.num_topics,
)

# print similarities (cosine similarity) for every document with a non-zero score
sim = index[new_area_lda]
print("Similar document indexes:")
for doc_id, score in enumerate(sim):
    if score > 0:
        print(doc_id, score)

Similar document indexes:
3 0.0859579
7 0.172446
8 0.927395
16 0.0927854
18 0.088962
20 0.267074
25 0.0927854
30 0.856836
31 0.927395
33 0.927395
37 0.0294071
38 0.0590954
39 0.0927854
40 0.0590954
41 0.172446
43 0.088962
45 0.107093
48 0.927395
56 0.927395
58 0.927395
59 0.938064
60 0.0859579
61 0.0590954
63 0.938064
64 0.0927854
65 0.0859579
66 0.0207918
67 0.267074
69 0.0927854
72 0.267074
73 0.172446
75 0.927395
77 0.927395
80 0.0101826
82 0.107093
83 0.107093
86 0.0859579
87 0.0590954
90 0.927395
91 0.0859579
92 0.0590954
93 0.267074
94 0.107093
95 0.107093
97 0.0590954
103 0.267074
104 0.088962
106 0.088962
112 0.172446
117 0.927395
119 0.088962
121 0.172446
123 0.172446
124 0.0859579
127 0.0590954
133 0.172446
136 0.0859579
137 0.0859579
144 0.088962
145 0.107093
146 0.107093
147 0.172446
148 0.0590954


In [None]:
from gensim.models import Word2Vec

# Train word embeddings on the tokenised research areas.
# NOTE(review): the `size` keyword is the older gensim spelling; newer gensim
# releases call it `vector_size` — confirm against the installed version.
w2v_options = dict(size=100, window=5, min_count=5, workers=4)
model_wv = Word2Vec(research_areas, **w2v_options)

In [None]:
# Look up the trained embedding vectors for the two words of interest.
economy, political = (model_wv.wv[word] for word in ('economy', 'political'))

In [None]:
# Sum of the two word vectors, kept under a name so it can be inspected or reused.
economy_political = economy + political
# TODO: the original cell ended with the placeholder `['abc'] = ___`, which is a
# SyntaxError (assignment to a literal) and stopped the whole cell from running.
# If the aim is to find the vocabulary word nearest to this sum, something like
# model_wv.wv.similar_by_vector(economy_political) may be what is wanted — confirm.
economy_political

[economy]
[geography]