# Text Classification and Analysis
Now we're at the point where we should be able to:
* Read in a collection of documents - a *corpus* which contains the manually coded and predicted papers
* Do some preliminary data analysis and select only papers published from 2008 onwards
* Load the spaCy language model
* Do some text processing and lemmatize the abstracts using the spaCy lemmatizer
* Use TF-IDF to fit and transform the vectorized texts
* Implement NMF for topic modelling
* Implement t-SNE to visualize the topics
* Implement geoparsing

## Perform imports and load the dataset
The seen dataset contains the bibliometric information of over 1600 publications which have been manually labelled by two authors. Inconsistencies have been discussed and agreed upon. 

In [None]:
from typing import Tuple
import copy as cp
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from wordcloud import WordCloud

In [None]:
### TODO Add your file path ###
file_path = '../../Data/ClimateEducation/'
#File that was extracted from the big database having only labelled data. Relevant = 1 means relevant.
file_name = 'output_relevant_unseen_seen_merge_shuffle_August2023.xlsx'
df = pd.read_excel(file_path + file_name)

df.head()

In [None]:
len(df)

In [None]:
df.rename(columns={'Unnamed: 0': 'Identifier'}, inplace = True)

In [None]:
df['PubYear'] = df['PubYear'].astype(int)

### Take a look at a typical abstract.

### Detect & remove empty strings
Technically, we're dealing with "whitespace only" strings. If the original .xlsx file had contained empty strings, pandas **.read_excel()** would have assigned NaN values to those cells by default.

In order to detect these strings we need to iterate over each row in the DataFrame. The **.itertuples()** pandas method is a good tool for this as it provides access to every field. For brevity we'll assign the names `i`, `lb` and `rv` to the `index`, `DOI` and `Title` columns, but we still need to decide which columns to iterate over, and the code has to reflect that choice.

## Take a quick look at the `Relevant` column:

In [None]:
# Bar chart of how many papers are irrelevant vs. relevant.
# value_counts() orders by frequency, so the bars are re-ordered by label
# value before the fixed tick labels and colours are applied; otherwise the
# names would be attached to the wrong bars whenever relevant papers are the
# majority.
# NOTE(review): assumes irrelevant papers carry a code lower than 1 (e.g. 0) - confirm.
relevance_counts = df['Relevant'].value_counts().sort_index()
ax = relevance_counts.plot.bar(figsize = (5,7), color=['orange', 'green'])
ax.set_xticklabels(['Irrelevant','Relevant'], rotation = 45, fontsize=12)
ax.set_ylabel("Number of papers", fontsize=12)
ax.set_title('Papers split',fontsize=14)
plt.savefig('../../Data/ClimateEducation/Figures_092024/irrelevant_relevant_Split.svg', dpi='figure',format = 'svg',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)

In [None]:
npr = df[df['Relevant'] == 1]

In [None]:
npr.reset_index(inplace = True)

In [None]:
plt.figure(figsize=(18,6))
subgrade_order = sorted(npr['PubYear'].dropna().astype(int).unique())
x1 = npr['PubYear'].dropna().astype(int)
chart = sns.countplot(x=x1,data=npr,order = subgrade_order, palette = 'coolwarm')
chart.set_xticklabels(chart.get_xticklabels(), rotation=90)

None
#npr['PubYear'].value_counts().plot(kind = 'bar', figsize = (12,6))

In [None]:
from matplotlib.ticker import FormatStrFormatter
plt.figure(figsize=(18,6))
subgrade_order = sorted(npr[npr['PubYear']>1960]['PubYear'].dropna().astype(int).unique())
x1 = npr['PubYear'].dropna().astype(int)
chart = sns.countplot(x=x1,data=npr,order = subgrade_order, palette = 'coolwarm')
chart.set_xticklabels(chart.get_xticklabels(), rotation=90, size =15)
chart.set_yticklabels(chart.get_yticks(), size = 15, )
chart.set_title("Publications per year", fontsize=20)
chart.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
chart.set_xlabel(xlabel = None)
chart.set_ylabel(ylabel = None)
plt.savefig('../../Data/ClimateEducation/Figures_092024/PubYear_All.eps', dpi='figure',format = 'eps',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)
None

In [None]:
# Given the low number of papers before 2008 we keep only 2008 onwards.
# .copy() makes npr_2008 an independent DataFrame: new columns are assigned
# to it further down, and writing to a slice of `npr` triggers pandas'
# SettingWithCopyWarning and may not behave as intended.
npr_2008 = npr[npr['PubYear'] >= 2008].copy()

In [None]:
len(npr[npr['PubYear'] >= 2008])

In [None]:
!python -m spacy validate

In [None]:
#Importa libraries for semantic analysis and other tasks
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
nlp = spacy.load("en_core_web_lg")

In [None]:
nlp.Defaults.stop_words |= {"climate","change","education", "climatic","changes","climat", "changing", "chang",
        "educ", "educational", "educative", "teach", "teaching","global", "warming", "die", 
                           "und", "das", "pro", "auf",'ll','ve', 'der', 'ein','sich', 'für'}

In [None]:
import requests
import functools
import re

AMERICAN_SPELLINGS_URL = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/american_spellings.json"


def _build_spelling_pattern(spelling_map):
    """Compile one regex that matches any key of *spelling_map* as a whole word.

    Returns None for an empty mapping.
    """
    if not spelling_map:
        return None
    # Longest spellings first so a long entry is preferred over a shorter
    # entry that is a prefix of it.
    ordered = sorted(spelling_map, key=len, reverse=True)
    return re.compile(r"\b(?:%s)\b" % "|".join(map(re.escape, ordered)))


@functools.lru_cache(maxsize=1)
def _default_spelling_resources():
    """Download the American->British table once and cache it with its regex."""
    response = requests.get(AMERICAN_SPELLINGS_URL, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    spelling_map = response.json()
    return spelling_map, _build_spelling_pattern(spelling_map)


def britishize(string, spelling_map=None):
    """Replace American spellings in *string* with their British equivalents.

    Parameters
    ----------
    string : str
        Text to convert.
    spelling_map : dict, optional
        Mapping of American spelling -> British spelling. When omitted, the
        table at AMERICAN_SPELLINGS_URL is downloaded on the first call and
        reused afterwards (previously it was downloaded on every call).

    Returns
    -------
    str
        The converted text. Only whole words are replaced, in a single pass,
        so a short entry can no longer rewrite a fragment of a longer word.
        Matching is case-sensitive.

    Raises
    ------
    requests.HTTPError
        If the default table cannot be downloaded successfully.
    """
    if spelling_map is None:
        spelling_map, pattern = _default_spelling_resources()
    else:
        pattern = _build_spelling_pattern(spelling_map)
    if pattern is None:
        return string
    return pattern.sub(lambda match: spelling_map[match.group(0)], string)

In [None]:
# Convert every abstract to British spelling.
# britishize() fetches its spelling table from a URL, so this cell needs a
# working network connection.
british_abstracts = npr_2008['Abstract'].apply(britishize)
npr_2008['Abstract_british'] = british_abstracts

In [None]:
# Lemmatise each British-spelling abstract with the loaded spaCy pipeline and
# store the lemmas as one space-separated string per paper.
def _lemmatize(text):
    """Return the lemmas of *text*, joined by single spaces."""
    return " ".join([token.lemma_ for token in nlp(text)])


npr_2008['abstract_lemmatized'] = npr_2008['Abstract_british'].apply(_lemmatize)

In [None]:
stop_words_2 = nlp.Defaults.stop_words

In [None]:
#tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range = (1,1), stop_words=stop_words_2)
#tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range = (1,2), stop_words=stop_words_2)
#tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range = (1,3), stop_words=stop_words_2)

In [None]:
# Bigram-only model (ngram_range=(2,2)).
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(2,2), stop_words=list(stop_words_2))
dtm = tfidf.fit_transform(npr_2008['abstract_lemmatized'])
# Sum every column once; slicing the sparse matrix with getcol() for each
# vocabulary term is needlessly slow.
column_sums_22 = np.asarray(dtm.sum(axis=0)).ravel()
# (term, summed TF-IDF weight over all documents) pairs
tfidf_weights_22 = [(word, column_sums_22[idx]) for word, idx in tfidf.vocabulary_.items()]
feature_names_22 = tfidf.get_feature_names_out()

In [None]:
# Unigram + bigram model (ngram_range=(1,2)) - the one used throughout the
# topic modelling.
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1,2), stop_words=list(stop_words_2))
dtm = tfidf.fit_transform(npr_2008['abstract_lemmatized'])
# Sum every column once; slicing the sparse matrix with getcol() for each
# vocabulary term is needlessly slow.
column_sums_12 = np.asarray(dtm.sum(axis=0)).ravel()
# (term, summed TF-IDF weight over all documents) pairs
tfidf_weights_12 = [(word, column_sums_12[idx]) for word, idx in tfidf.vocabulary_.items()]
feature_names_12 = tfidf.get_feature_names_out()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FormatStrFormatter

# Word clouds shown in the lower row. They are built here from the summed
# TF-IDF weights because no earlier cell defines w12 / w22.
# NOTE(review): the WordCloud settings are a guess - adjust to the look of the
# published figure.
w12 = WordCloud(background_color='white').generate_from_frequencies(dict(tfidf_weights_12))
w22 = WordCloud(background_color='white').generate_from_frequencies(dict(tfidf_weights_22))

# Create a single figure for the count plot and word clouds
fig = plt.figure(figsize=(12, 10))

# Count plot spanning the full width of the first row
ax1 = fig.add_subplot(2, 1, 1)  # 2 rows, 1 column, first subplot
x1 = npr_2008['PubYear'].dropna().astype(int)
subgrade_order = sorted(npr_2008['PubYear'].dropna().astype(int).unique())
chart = sns.countplot(x=x1, data=npr_2008, order=subgrade_order, palette='coolwarm', ax=ax1)
# Style the ticks through tick_params: re-setting tick labels from the current
# tick values pins fixed label texts and makes matplotlib warn.
chart.tick_params(axis='x', labelrotation=90, labelsize=12)
chart.tick_params(axis='y', labelsize=12)
chart.set_title("Publications per Year from 2008", fontsize=16)
chart.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
chart.set_xlabel(xlabel=None)
chart.set_ylabel('Count of Publications', fontsize=12)

# Word clouds side by side in the second row
ax2 = fig.add_subplot(2, 2, 3)  # 2 rows, 2 columns, first column of second row
ax2.imshow(w12, interpolation='bilinear')
#ax2.set_title('Word Cloud 1', fontsize=16)
ax2.axis('off')  # Hide axis

ax3 = fig.add_subplot(2, 2, 4)  # 2 rows, 2 columns, second column of second row
ax3.imshow(w22, interpolation='bilinear')
#ax3.set_title('Word Cloud 2', fontsize=16)
ax3.axis('off')  # Hide axis

# Adjust layout to look better
plt.tight_layout(pad=2)

# Save the figure as an SVG file
plt.savefig('../../Data/ClimateEducation/Figures_092024/combined_plot.svg', bbox_inches='tight')

# Show the combined figure
plt.show()

In [None]:
occ = np.asarray(dtm.sum(axis=0)).ravel().tolist()
counts_df = pd.DataFrame({'term': tfidf.get_feature_names_out(), 'occurrences': occ})
counts_df.sort_values(by='occurrences', ascending=False).head(10)

In [None]:
weights = np.asarray(dtm.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': tfidf.get_feature_names_out(), 'weight': weights})
weights_df = weights_df.sort_values(by='weight', ascending=False).head(200)
#weights_df.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/term_weight_gram12.xlsx',index = False, header=True)
weights_df.head(10)

In [None]:
import string
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
import nltk

# Contraction map: lower-case contraction -> expanded form.
c_dict = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there had",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'alls": "you alls",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had",
    "you'd've": "you would have",
    # Fixed: these two used to expand to "you you will ..."
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Compile the contraction keys into one alternation.
# Regex alternation takes the first alternative that matches, so the keys are
# sorted longest-first; otherwise "can't" wins over "can't've" and leaves a
# dangling "'ve" behind. Keys are escaped so they are always matched literally.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(contraction)
    for contraction in sorted(c_dict, key=len, reverse=True)
))

# List of stop words
#add_stop = ['climate', 'change', 'education', 'science']
#stop_words = ENGLISH_STOP_WORDS.union(add_stop)
#nlp.Defaults.stop_words |= {"climate","change","education", "climatic","changes","climat", "changing", "chang",
#                           "educ", "educational", "educative", "teach", "teaching"}

stop_words = nlp.Defaults.stop_words

#nlp.Defaults.stop_words.add('climate','change','education')


# List of punctuation
punc = list(set(string.punctuation))


# Tokenise text with NLTK's TweetTokenizer (intended to keep contractions
# intact and split off trailing punctuation).
def casual_tokenizer(text):
    """Return the list of tokens a default TweetTokenizer produces for *text*."""
    return TweetTokenizer().tokenize(text)


def expandContractions(text, c_re=c_re):
    """Replace every contraction matched by *c_re* in *text* with its c_dict expansion.

    *c_re* defaults to the module-level compiled contraction pattern; each
    match is looked up in c_dict by its exact matched text.
    """
    return c_re.sub(lambda match: c_dict[match.group(0)], text)


def process_text(text):
    """Tokenise and normalise *text* into a list of stemmed tokens.

    Each token from casual_tokenizer() is lower-cased, stripped of digits,
    contraction-expanded and stemmed. A token is then dropped if it is a
    punctuation character, a stop word, shorter than two characters, or
    contains a space (which removes contractions that expanded to several
    words).

    Returns the surviving tokens in their original order.
    """
    # One stemmer per call rather than one per token.
    stemmer = SnowballStemmer('english')
    processed = []
    for token in casual_tokenizer(text):
        token = re.sub('[0-9]+', '', token.lower())
        token = stemmer.stem(expandContractions(token, c_re=c_re))
        if token in punc or token in stop_words:
            continue
        if len(token) > 1 and ' ' not in token:
            processed.append(token)
    return processed

def top_words(topic, n_top_words):
    """Return the indices of the *n_top_words* largest entries of *topic*, largest first."""
    descending = topic.argsort()[::-1]
    return descending[:n_top_words]


def topic_table(model, feature_names, n_top_words):
    """Build a DataFrame with one column per topic of *model*.

    Column k lists the *n_top_words* terms from *feature_names* that
    top_words() ranks highest for row k of ``model.components_``.
    """
    columns = {
        topic_idx: [feature_names[i] for i in top_words(weights, n_top_words)]
        for topic_idx, weights in enumerate(model.components_)
    }
    return pd.DataFrame(columns)


def whitespace_tokenizer(text):
    """Return the runs of two or more word characters found in *text*."""
    word_pattern = r"(?u)\b\w\w+\b"
    return RegexpTokenizer(word_pattern).tokenize(text)


# Function to remove duplicate words
def unique_words(text):
    """Return the items of *text* with duplicates removed, keeping first-seen order.

    Hashable items are deduplicated in linear time; if an item is unhashable
    the function falls back to a pairwise comparison.
    """
    try:
        # dicts preserve insertion order, so this keeps the first occurrence.
        return list(dict.fromkeys(text))
    except TypeError:
        ulist = []
        for item in text:
            if item not in ulist:
                ulist.append(item)
        return ulist


def word_count(text):
    """Return the number of whitespace-separated words in ``str(text)``.

    Splitting on any run of whitespace means empty text counts as 0 words and
    repeated spaces, tabs or newlines do not inflate the count.
    """
    return len(str(text).split())

In [None]:
# Process the text
npr_2008['processed_text'] = npr_2008['abstract_lemmatized'].apply(process_text)

In [None]:
npr_2008.reset_index(inplace = True)

In [None]:
npr_2008.drop('level_0', inplace = True, axis = 1)

In [None]:
npr_2008.drop('index', inplace = True, axis = 1)

In [None]:
from sklearn.decomposition import NMF

In [None]:
nmf_model = NMF(n_components=15,random_state=43)

In [None]:
nmf_model.fit(dtm)

In [None]:
# For every NMF topic, collect its 10 highest-weighted terms (in ascending
# weight order, as argsort returns them).
# The vocabulary array is fetched once; it was previously rebuilt for every
# single term of every topic.
nmf_feature_names = tfidf.get_feature_names_out()
topics_list = []
for index, topic in enumerate(nmf_model.components_):
    topics_list.append([nmf_feature_names[i] for i in topic.argsort()[-10:]])

In [None]:
topics_df = pd.DataFrame(topics_list)
topics_df = topics_df.T
topics_df = topics_df.add_prefix('Topic_')
topics_df.iloc[::-1]

In [None]:
topics_df.iloc[::-1].to_excel("../../Data/ClimateEducation/ListOfTopics.xlsx")

In [None]:
#This are the suggested title for the topics above from 0 to len(topics)
topics = ['Science_Research','Medical_Health','Physical_Geography','Undefined_1','Sustainability_Sustainable',
          'Adaptation_Community', 'Energy_Mitigation','Environment_Behaviour','Undefined_2','Undefined_3',
          'Student_Learning','Disaster_Risk','Teacher_Science', 'Child_Young','Game_Gamification']

In [None]:
nc = len(nmf_model.components_)

In [None]:
topic_df3 = pd.DataFrame(
    nmf_model.transform(dtm), 
    columns=["topic_{}".format(i) for i in range(nc)]
).astype(float)
topic_df3.index = npr_2008.index
npr_topics = pd.concat([npr_2008, topic_df3], axis=1)

In [None]:
topic_results = nmf_model.transform(dtm)

In [None]:
topic_results.shape

In [None]:
# Transforming the document-term matrix
topic_results = nmf_model.transform(dtm)

# Topic mapping
topic_mapping = {
    0: 'Science_Research', 1: 'Medical_Health', 2: 'Physical_Geography',
    3: 'Undefined_1', 4: 'Sustainability_Sustainable',
    5: 'Adaptation_Community', 6: 'Energy_Mitigation',
    7: 'Environment_Behaviour', 8: 'Undefined_2', 9: 'Undefined_3',
    10: 'Student_Learning', 11: 'Disaster_Risk', 12: 'Teacher_Science',
    13: 'Child_Young', 14: 'Game_Gamification'
}

# Count the number of documents for each topic
doc_topic_counts = topic_results.argmax(axis=1)  # Get the most associated topic for each document
topic_member_counts = np.bincount(doc_topic_counts)  # Count occurrences per topic

# Create a DataFrame to map topics to their counts and labels
topic_counts_df = pd.DataFrame({
    'Topic': np.arange(len(topic_member_counts)),
    'Count': topic_member_counts
})

# Add the Topic Labels to the DataFrame
topic_counts_df['Topic Label'] = topic_counts_df['Topic'].map(topic_mapping)

# Filter out topics with fewer than 100 members and sort by count in descending order
valid_topics_df = topic_counts_df[topic_counts_df['Count'] >= 100].sort_values(by='Count', ascending=False)

# Look up the highest-weighted terms of selected topics
def get_top_words(model, feature_names, valid_topics, n_words=10):
    """Map each topic index in *valid_topics* to its top terms.

    For every index, the *n_words* largest entries of the matching row of
    ``model.components_`` are located and translated through *feature_names*,
    highest weight first. Returns a dict of index -> list of terms.
    """
    def _ranked_terms(topic_idx):
        best_first = model.components_[topic_idx].argsort()[-n_words:][::-1]
        return [feature_names[position] for position in best_first]

    return {topic_idx: _ranked_terms(topic_idx) for topic_idx in valid_topics}

n_words = 10  # Number of words per topic

# Get the top words for valid topics.
# Stored under its own name: rebinding `top_words` here would shadow the
# top_words() helper that topic_table() calls.
topic_top_words = get_top_words(
    nmf_model, tfidf.get_feature_names_out(), valid_topics_df['Topic'].values, n_words=n_words
)

# Grid of bar plots for valid topics, ordered by document counts
n_topics = len(topic_top_words)
n_cols = 3  # Number of columns
# At least one row so plt.subplots() still works when no topic qualifies
n_rows = max(1, int(np.ceil(n_topics / n_cols)))

# Generate a color palette with enough colors for all topics
colors = sns.color_palette("tab20", len(topic_mapping))  # Use a distinct color palette

# Create a color mapping for topics based on the palette
color_mapping = {idx: colors[i] for i, idx in enumerate(topic_mapping.keys())}

# squeeze=False always returns a 2-D array of axes, so flatten() is safe
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axs = axs.flatten()  # Flatten the array of axes for iteration

# Plot each topic's top words; the dict already follows the document-count order
for i, (topic_idx, words) in enumerate(topic_top_words.items()):
    topic_weights = nmf_model.components_[topic_idx]
    word_indices = np.arange(len(words))
    # Weights of the same top terms, highest first
    word_counts = topic_weights[np.argsort(topic_weights)[-n_words:][::-1]]

    # Bar plot using the consistent color for each topic
    axs[i].barh(word_indices, word_counts, color=color_mapping[topic_idx])
    axs[i].set_yticks(word_indices)
    axs[i].set_yticklabels(words)

    # Title includes topic number and label
    topic_label = topic_mapping.get(topic_idx, "Unknown Topic")
    axs[i].set_title(f'Topic {topic_idx}: {topic_label}', fontsize=16)
    axs[i].set_xlabel('Importance', fontsize=14)

# Remove any empty subplots
for j in range(n_topics, n_rows * n_cols):
    fig.delaxes(axs[j])

plt.tight_layout()
#plt.savefig('../../Data/ClimateEducation/Figures_092024/Topics_words_importance.svg', format='svg', dpi='figure',
#        bbox_inches='tight', pad_inches=0.2,
#        facecolor='auto', edgecolor='auto',
#        backend=None)

plt.show()

In [None]:
topic_counts_df

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Assuming 'topic_results' is defined as your document-topic matrix

# Topic mapping
topic_mapping = {
    0: 'Science_Research', 1: 'Medical_Health', 2: 'Physical_Geography', 
    3: 'Undefined_1', 4: 'Sustainability_Sustainable', 
    5: 'Adaptation_Community', 6: 'Energy_Mitigation', 
    7: 'Environment_Behaviour', 8: 'Undefined_2', 9: 'Undefined_3',
    10: 'Student_Learning', 11: 'Disaster_Risk', 12: 'Teacher_Science',
    13: 'Child_Young', 14: 'Game_Gamification'
}

# Count number of documents for each topic
doc_topic_counts = np.bincount(topic_results.argmax(axis=1))

# Filter out topics with fewer than 100 members
valid_topics = np.where(doc_topic_counts >= 100)[0]

# Perform t-SNE on valid topic results
# We need to select only columns corresponding to valid topics
valid_topic_indices = np.isin(topic_results.argmax(axis=1), valid_topics)
tsne_model = TSNE(n_components=2, random_state=0, perplexity=30, verbose=1)
tsne_results = tsne_model.fit_transform(topic_results[valid_topic_indices])

# Create a DataFrame for t-SNE results
tsne_df = pd.DataFrame(data=tsne_results, columns=['Dim 1', 'Dim 2'])

# Re-map topics to valid ones based on the original topic results
tsne_df['Topic'] = topic_results[valid_topic_indices].argmax(axis=1)

# Map topics to labels
tsne_df['Topic Label'] = tsne_df['Topic'].map(topic_mapping)

In [None]:
year = npr_2008['PubYear']

In [None]:
# tsne_df only holds the documents selected by valid_topic_indices and has a
# fresh 0..k-1 index. Assigning the full `year` Series would align on index
# labels and attach the wrong publication years, so the same rows are selected
# positionally instead.
tsne_df['year'] = np.asarray(year)[valid_topic_indices]

In [None]:
tsne_df['Topic'].value_counts()

In [None]:
tsne_df

In [None]:
tsne_df.to_csv('../../Data/ClimateEducation/tsne_df_output.csv')

In [None]:
# Select the colors for each topic (ensure the order matches the topic mapping)
colors = sns.color_palette("tab20", len(topic_mapping))

# Create a dictionary to map topic labels to specific colors
color_mapping = {label: colors[i] for i, label in enumerate(topic_mapping.values())}

In [None]:
plt.figure(figsize=(12, 8))
scatter = sns.scatterplot(data=tsne_df, x='Dim 1', y='Dim 2', hue='Topic Label', palette=color_mapping, alpha=0.7)

# Draw straight lines indicating the labels
for topic in valid_topics:
    center = tsne_df[tsne_df['Topic'] == topic][['Dim 1', 'Dim 2']].median()  # Calculate center
    plt.text(center['Dim 1'], center['Dim 2'], topic_mapping[topic], 
             horizontalalignment='center', verticalalignment='center', fontsize=12, weight='bold')

plt.title('t-SNE Visualization of Topics', fontsize=18)

# Sorting the legend in alphabetical order based on the unique Topic Labels
handles, labels = scatter.get_legend_handles_labels()
sorted_indices = sorted(range(len(labels)), key=lambda i: labels[i])
handles = [handles[i] for i in sorted_indices]
labels = [labels[i] for i in sorted_indices]

# Create the legend with sorted labels
plt.legend(handles, labels, title='Topics', loc='upper center', bbox_to_anchor=(0.5, -0.08), ncol=4)

# Save the figure
plt.savefig('../../Data/ClimateEducation/Figures_092024/Climate_Education_tse.svg', format='svg', dpi='figure',
            bbox_inches='tight', pad_inches=0.2,
            facecolor='auto', edgecolor='auto',
            backend=None)

plt.show()

In [None]:
topic_trends = tsne_df[tsne_df['year'] <= 2022]

In [None]:
topic_trends.groupby('Topic Label').sum()

In [None]:
# Count how many documents carry each topic label in each year.
# NOTE(review): this rebuilds topic_trends from the full tsne_df, so the
# `year <= 2022` subset assigned to topic_trends in an earlier cell is not what
# gets plotted here - confirm that is intended.
topic_trends = tsne_df.groupby(['year', 'Topic Label']).size().reset_index(name='count')

# Palette sized to the number of distinct topic labels.
# NOTE: `palette` is not used by the plot below, which takes its colours from
# color_mapping.
unique_labels = topic_trends['Topic Label'].unique()
num_labels = len(unique_labels)
palette = sns.color_palette("tab20", num_labels)  # Create a distinct color palette

# Now we can plot the trends over time
plt.figure(figsize=(12, 6))

# One line per topic label, coloured through color_mapping
sns.lineplot(data=topic_trends, x='year', y='count', hue='Topic Label', palette=color_mapping, marker='o')

plt.title('Topic Trends Over Time', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Count of Documents', fontsize=14)
plt.legend(title='Topics', bbox_to_anchor=(0.03, 1), loc='upper left')
plt.xticks(rotation=45)  # Optional: Rotate x-axis labels for better readability
plt.tight_layout()
#plt.savefig('../../Data/ClimateEducation/Figures_092024/Topic_trends.eps', format ='eps', dpi='figure',
#        bbox_inches='tight', pad_inches=0.2,
#        facecolor='auto', edgecolor='auto',
#        backend=None)

In [None]:
# Assuming topic_trends is already defined as your DataFrame
# Group by 'Topic Label' and sum occurrences
topic_sums = topic_trends.groupby('Topic Label')['count'].sum().reset_index()

# Calculate the percentage of each topic
total_count = topic_sums['count'].sum()
topic_sums['percentage'] = (topic_sums['count'] / total_count) * 100

print(topic_sums)  # View the counts and percentages

In [None]:
topic_sums['percentage'].sort_values(ascending=True)

In [None]:
# Set the figure size
plt.figure(figsize=(12, 6))

topic_sums_sorted = topic_sums.sort_values(by='percentage', ascending=False)

# Create a bar plot to show the percentage
sns.barplot(data=topic_sums_sorted, x='percentage', y='Topic Label',palette=color_mapping)
# Adding titles and labels
plt.title('Percentage of Topics', fontsize=18)
plt.xlabel('Percentage of Total (%)', fontsize=14)
plt.ylabel('Topic Label', fontsize=14)

# Show the plot
plt.tight_layout()
#plt.savefig('../../Data/ClimateEducation/Figures_092024/Topic_percentage.svg', format ='svg', dpi='figure',
#        bbox_inches='tight', pad_inches=0.2,
#        facecolor='auto', edgecolor='auto',
#        backend=None)

In [None]:
# Create a figure with two subplots: one for the bar plot and one for the line plot
fig, ax1 = plt.subplots(2, 1, figsize=(12, 12))  # 2 rows, 1 column

# **Bar Plot** (First subplot)
#topic_sums_sorted = topic_sums.sort_values(by='percentage', ascending=False)
sns.barplot(data=topic_sums_sorted, x='percentage', y='Topic Label', palette=color_mapping, ax=ax1[0])
ax1[0].set_title('Percentage of Topics', fontsize=18)
ax1[0].set_xlabel('Percentage of Total (%)', fontsize=14)
ax1[0].set_ylabel('Topic Label', fontsize=14)
ax1[0].tick_params(axis='y', labelsize=12)
ax1[0].tick_params(axis='x', labelsize=12)

# **Line Plot** (Second subplot)
#topic_trends = tsne_df.groupby(['year', 'Topic Label']).size().reset_index(name='count')
#unique_labels = topic_trends['Topic Label'].unique()
#num_labels = len(unique_labels)
#color_mapping = sns.color_palette("tab20", num_labels)  # Create a distinct color palette

sns.lineplot(data=topic_trends, x='year', y='count', hue='Topic Label', palette=color_mapping, marker='o', ax=ax1[1])
ax1[1].set_title('Topic Trends Over Time', fontsize=18)
ax1[1].set_xlabel('Year', fontsize=14)
ax1[1].set_ylabel('Count of Documents', fontsize=14)
ax1[1].legend(title='Topics', bbox_to_anchor=(0.05, 1), loc='upper left')
ax1[1].tick_params(axis='x', rotation=45)  # Rotate x-axis labels for better readability
ax1[1].tick_params(axis='y', labelsize=12)
ax1[1].tick_params(axis='x', labelsize=12)

# Adjust layout to prevent overlap
plt.tight_layout()

# Save the combined figure
plt.savefig('../../Data/ClimateEducation/Figures_092024/Combined_Topic_Analysis.svg', 
            format='svg', dpi='figure',
            bbox_inches='tight', pad_inches=0.2,
            facecolor='auto', edgecolor='auto')

# Show the combined figure
plt.show()

In [None]:
# Generate a color palette
unique_labels = topic_sums['Topic Label'].unique()
palette = sns.color_palette("tab20", len(unique_labels))  # Use the same palette to maintain consistency

# Create a mapping of topic labels to colors
color_mapping = {label: palette[i] for i, label in enumerate(unique_labels)}

In [None]:
npr_topics

In [None]:
my_topic_dic = {0: 'Science_Research', 1:'Medical_Health', 2: 'Physical_Geography', 
                3: 'Undefined_1', 4: 'Sustainability_Sustainable', 
                5: 'Adaptation_Community', 6: 'Energy_Mitigation', 
                7: 'Environment_Behaviour', 8:'Undefined_2', 9:'Undefined_3',
                10: 'Student_Learning', 11:'Disaster_Risk', 12:'Teacher_Science',
                13: 'Child_Young', 14:'Game_Gamification'}

In [None]:
npr_topics['Topic'] = topic_results.argmax(axis=1)

In [None]:
npr_topics['Topic Label'] = npr_topics['Topic'].map(my_topic_dic)

In [None]:
npr_topics["Topic Label"].nunique()

In [None]:
npr_topics["Topic"].nunique()

In [None]:
npr_topics["Topic"].value_counts(normalize=True)*100

In [None]:
npr_topics["Topic"].value_counts()

In [None]:
topic_counts = npr_topics['Topic'].value_counts(normalize=True)*100
thresholds_2 = 1
npr_topics_select = npr_topics[npr_topics['Topic'].isin(topic_counts[topic_counts > thresholds_2].index)]

In [None]:
npr_topics.to_csv('../../Data/ClimateEducation/npr_topics.csv')

In [None]:
import matplotlib.colors as pltc
import matplotlib.ticker as mtick

In [None]:
npr_topics_select["Topic Label"].value_counts(normalize=True)*100

In [None]:
npr_topics["Topic"].value_counts(normalize=True)*100

In [None]:
ax = npr_topics_select.boxplot(column = ['topic_4', 'topic_10', 'topic_0', 'topic_5','topic_7', 'topic_13', 'topic_12', 
                           'topic_1','topic_6','topic_11','topic_14'])
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, fontsize=10, ha='right', rotation_mode="anchor")

In [None]:
from scipy.stats import pearsonr
import matplotlib.pyplot as plt 

def corrfunc(x, y, hue = None, ax=None, **kws):
    """Annotate an axes with the Pearson correlation of *x* and *y*.

    The coefficient is written as ``ρ = <value>`` (two decimals) near the
    top-left corner, in axes coordinates. *ax* falls back to the current
    matplotlib axes when it is not supplied; *hue* and extra keyword
    arguments are accepted but not used.
    """
    coefficient = pearsonr(x, y)[0]
    target = ax if ax else plt.gca()
    target.annotate(f'ρ = {coefficient:.2f}', xy=(.1, .9), xycoords=target.transAxes)

In [None]:
# Topic-weight columns to keep (in this order) and the readable label each
# one is renamed to.
topic_column_labels = {
    'topic_10': 'Student_Learning',
    'topic_4': 'Sustainability_Sustainable',
    'topic_5': 'Adaptation_Community',
    'topic_0': 'Science_Research',
    'topic_7': 'Environment_Behaviour',
    'topic_13': 'Child_Young',
    'topic_12': 'Teacher_Science',
    'topic_1': 'Health_Medical',
    'topic_6': 'Energy_Mitigation',
    'topic_11': 'Disaster_Risk',
    'topic_14': 'Game_Gamification',
}

# Select the columns and rename them in one chained step
npr_topic = npr_topics_select[list(topic_column_labels)].rename(columns=topic_column_labels)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))
ax = npr_topic.boxplot()
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, fontsize=10, ha='right', rotation_mode="anchor")
plt.savefig('../../Data/ClimateEducation/Figures_092024/box_plots.svg',dpi='figure',
        bbox_inches='tight', pad_inches=0.2,
        facecolor='auto', edgecolor='auto',
        backend=None)

In [None]:
ax = sns.boxplot(data=npr_topic)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, fontsize=10, ha='right', rotation_mode="anchor")

In [None]:
plt.style.use('default')

# Topic columns compared pairwise. The labels used here before
# ("Sytem_Learning_Social_Research", "Learning_School_Student", ...) are not
# columns of npr_topic, so the pairplot could not be drawn.
# NOTE(review): these four are the closest current equivalents - confirm the selection.
pair_vars = ["Science_Research", "Student_Learning", "Teacher_Science", "Environment_Behaviour"]

g = sns.pairplot(
    npr_topic,
    x_vars=pair_vars,
    y_vars=pair_vars,
    height=2, markers=".", diag_kind = 'hist')

#g.map_lower(corrfunc)
#plt.show()
plt.savefig('../Figure_GPT2_5/Climate_Education_pairplot.eps')

Something to try would be a heatmap of two topics that cross-checks when both of them exceed a certain threshold, e.g. the university topic and the children topic.

In [None]:
npr_topic

In [None]:
aa1=npr_topic.corr(method='spearman')

In [None]:
aa1 = aa1.sort_index()

In [None]:
aa1 = aa1.sort_index(axis =1)

In [None]:
aa1

In [None]:
# Abbreviate very long journal names so that they fit on plot axes.
# Titles are lower-cased once and all abbreviations are applied in a single
# whole-value replacement. Previously every replacement re-applied
# `.str.lower()`, which lower-cased the abbreviations inserted by the earlier
# steps and left only the last one in its intended capitalisation.
# NOTE: the grouping/plotting cells that follow lower-case 'Source title'
# again, so the capitalisation below is only visible where the column is used
# directly.
journal_abbreviations = {
    'proceedings of the national academy of sciences of the united states of america': 'pnas',
    'international journal of sustainability in higher education': 'Int Jour Sust Higher Edu',
    'international journal of environmental research and public health': 'IJERPH',
    'international research in geographical and environmental education': 'Int. Res. Geogr. Environ.',
}
npr_topics['Source title'] = (
    npr_topics['Source title'].str.lower().replace(journal_abbreviations)
)

In [None]:
a = npr_topics.groupby(by=npr_topics['Source title'].str.lower())['Times cited'].sum()

b= a/(a.sum())

In [None]:
a.sum()

In [None]:
c = (b*100).sort_values(ascending=False)

In [None]:
# Imported here because this cell runs before the notebook cell that imports
# `matplotlib.ticker as mtick`; without it a top-to-bottom run fails with a
# NameError on `mtick`.
import matplotlib.ticker as mtick

# Two stacked horizontal bar charts: the 20 journals with the largest share of
# citations (top) and the 20 journals with the largest share of papers (bottom).
fig, axes = plt.subplots(nrows=2, ncols=1, figsize =(10,15))
x0 = c.head(20)
x1 = (npr_topics['Source title'].str.lower().value_counts(normalize = True)*100).head(20)

panels = (
    (axes[0], x0, 'Proportion of citations', 'Top 20 journals based on citations'),
    (axes[1], x1, 'Proportion of publications', 'Top 20 journals based on number of papers'),
)
for panel_ax, shares, xlabel, title in panels:
    shares.plot.barh(ax=panel_ax, color ='dimgrey')
    panel_ax.set(ylabel=None)
    panel_ax.set_xlabel(xlabel, fontsize=14)
    panel_ax.set_yticklabels(panel_ax.get_yticklabels(), fontsize=14)
    panel_ax.tick_params(axis='both', labelsize=14)
    # The plotted values are already percentages, so only the "%" sign is added.
    panel_ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals=1))
    panel_ax.set_title(label = title, fontsize =16)

plt.savefig('../Figure_GPT2_8/Journals_citations_publications.eps', format = 'eps', dpi='figure',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)

In [None]:
npr_2008['Source title'].str.lower().value_counts(normalize = True).head(20)

In [None]:
# Ask before writing: the topic table is exported to CSV and Excel only when
# the answer typed by the user is exactly 'yes'.
print("Do you want to save the datasets?")
x = input()
if x != 'yes':
    print('No need to save because answer was: ' + x)
else:
    npr_topics.to_csv('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/npr_2008_GPT2_October2023.csv')
    npr_topics.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/npr_2008_GPT2_October2023.xlsx',index = False, header=True)

In [None]:
npr_topics

# Geoparsing 
## We now perform some analysis based on countries and continents.
* First we import the geoparser for geographical entity recognition. We use GeoText.
* We assign the countries found in each abstract to the whole dataframe.
* We then assign country codes and continents to a slice of the dataframe.
* We work with this new dataframe to produce maps, spider diagrams, etc.

In [None]:
!pip install geotext

In [None]:
#Trying GeoText. It cannot tell when the same country is mentioned more than once in the same abstract.
#This needs to be handled manually in the dataframe.
from geotext import GeoText
#places = GeoText("London is a great city")
#places.cities
# "London"
#places.country_mention
from pycountry_convert import  country_alpha2_to_continent_code, country_alpha3_to_country_alpha2,country_name_to_country_alpha2,country_name_to_country_alpha3

In [None]:
#Parse countries in the Abstracts
# Common abbreviations are expanded first so that GeoText sees full names.
# The dots are escaped so they only match a literal "."; the dotted forms end
# with a "not followed by a word character" check because a trailing \b never
# matches between "." and a space. The old patterns therefore missed "U.S. "
# and could match unrelated tokens such as "USSR".
us_patterns = [r"\bUSA\b", r"\bUS\b", r"\bU\.S\.A\b", r"\bU\.S\.(?!\w)"]
uk_patterns = [r"\bUK\b", r"\bU\.K\.(?!\w)"]

geo_data_country = (
    npr_2008['Abstract']
    .replace(us_patterns, "United States", regex=True)
    .replace(uk_patterns, "United Kingdom", regex=True)
    # One list of country names per abstract.
    .apply(lambda x: GeoText(x).countries)
)
geo_data_country.info()

In [None]:
geo_data_country = geo_data_country.rename('Countries')

In [None]:
#Add countries as a list to the dataframe column countries
npr_topics['countries'] = geo_data_country

In [None]:
# Optionally export `npr_2008` to CSV and Excel; nothing is written unless the
# answer typed by the user is exactly 'yes'.
# NOTE(review): the 'countries' column was added to `npr_topics`, while this
# cell writes `npr_2008` to the "..._Geo_..." files - confirm that this is the
# frame meant to be saved here.
print("Do you want to save the datasets?")
x = input()
if x == 'yes':
    npr_2008.to_csv('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/npr_2008_GPT2_Geo_October2023.csv')
    npr_2008.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/npr_2008_GPT2_Geo_October2023.xlsx',index = False, header=True)
#    npr.to_csv('/Users/vmuccion/Documents/Projects/ClimateEducation/Data/npr.csv')
#    npr.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Data/npr.xlsx',index = False, header=True)
else:
    # Echo the answer so it is clear why no file was produced.
    print('No need to save because answer was: ' + x)

In [None]:
print("Do you want to save the datasets?")
x = input()
if x == 'yes':
    npr_topics.to_csv('../../Data/ClimateEducation/Data_September_2024/npr_2008_GPT2_Geo_September2024.csv')
    #npr_topics.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Output_Data/npr_topics_GPT2_Geo_October2023.xlsx',index = False, header=True)
#    npr.to_csv('/Users/vmuccion/Documents/Projects/ClimateEducation/Data/npr.csv')
#    npr.to_excel('/Users/vmuccion/Documents/Projects/ClimateEducation/Data/npr.xlsx',index = False, header=True)
else:
    print('No need to save because answer was: ' + x)

In [None]:
npr_topics

In [None]:
import matplotlib.ticker as mtick

In [None]:
# Horizontal bar chart of the 33 most frequently mentioned countries.
# A country is counted at most once per paper (duplicate paper/country rows are
# dropped) and the bar length is its share, in percent, of all such
# paper/country mentions.
country_mentions = (
    npr_topics['countries']
    .explode()
    .reset_index()
    .drop_duplicates(keep = "first")['countries']
)
country_share = country_mentions.value_counts(ascending=False, normalize=True) * 100
ax = country_share.head(33).plot.barh(figsize=(10,8))

ax.set(xlabel=None)
ax.set_xticklabels(ax.get_xticklabels(), fontsize=12)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)
ax.xaxis.set_major_formatter(mtick.PercentFormatter(decimals = 0))
ax.grid(axis = 'y', linestyle = '--', linewidth = 0.5)

# Keep only the bottom and left frame lines.
for side, shown in (("bottom", True), ("top", False), ("right", False), ("left", True)):
    ax.spines[side].set_visible(shown)

plt.savefig('../../Data/ClimateEducation/Figures_092024/Publications_countries.eps', format = 'eps',dpi='figure',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)

In [None]:
npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries'].value_counts(ascending=False)

In [None]:
test = npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")

In [None]:
test = test.set_index('index')

In [None]:
test = pd.concat([test,npr_topics['Topic Label']], axis=1)

In [None]:
test = test.dropna()

In [None]:
test = test.reset_index()

In [None]:
test = test.drop('index',axis =1)

In [None]:
sub_test = test[test.groupby('countries').countries.transform('count')>=20].copy() 

In [None]:
sub_test.head(20)

In [None]:
sub_test['countries'].nunique()

In [None]:
test['countries'].value_counts()

In [None]:
sub_test['countries'].value_counts()

In [None]:
sub_test[sub_test['Topic Label'] == 'Science_Research']['countries'].value_counts()

In [None]:
df_plot = sub_test.groupby(['Topic Label', 'countries']).size().reset_index().pivot(columns='Topic Label', index='countries', values=0)

In [None]:
df_plot.info()

In [None]:
df_plot = df_plot.fillna(0)

In [None]:
df_plot = df_plot.iloc[:,:-3]
#df_plot = df_plot.iloc[:,:]

In [None]:
df_plot.columns = df_plot.columns.get_level_values(0)

In [None]:
df_plot

In [None]:
df_plot =df_plot.astype(int)

In [None]:
fig, ax = plt.subplots(figsize=(15,7))
ax = sns.heatmap(df_plot.drop('Physical_Geography', axis=1).T, ax=ax, linecolor='white',annot = True,linewidths=2,cmap="crest", cbar = False)
plt.savefig('../../Data/ClimateEducation/Figures_092024/Country_Topic_association.svg', format = 'svg',dpi='figure',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)

In [None]:
#We redo the sns heatmap plot due to a bug in Seaborn. We save the files and
#use a different notebook to avoid having to restart this one.

aa1.to_csv('../../Data/ClimateEducation/Data_September_2024/data_for_heatmap.csv')
df_plot.to_csv('../../Data/ClimateEducation/Data_September_2024/data_for_country_counts.csv')

In [None]:
#npr_new.drop_duplicates(keep = "first").head(20)

In [None]:
#npr_new.groupby(['index', 'countries']).first()

In [None]:
df10 = npr_topics['countries'].explode().reset_index()

In [None]:
#for i, row in df10.iterrows():    
def _alpha3_or_blank(country_name):
    """Return the ISO alpha-3 code for ``country_name``, or '' if it cannot be resolved."""
    try:
        return country_name_to_country_alpha3(country_name)
    # Only lookup failures are treated as "no code": an unknown name
    # (presumably a KeyError from pycountry_convert - verify) or a non-string
    # entry such as the NaN left by exploding an empty list. Anything else,
    # e.g. a keyboard interrupt, is no longer swallowed.
    except (KeyError, TypeError, AttributeError):
        return ''


# One ISO alpha-3 code per exploded country mention ('' when unresolved).
code = [_alpha3_or_blank(p) for p in df10.countries]
df10['code'] = code

In [None]:
#npr_2008.drop('Country_0', axis = 1, inplace = True)

In [None]:
#npr_2008['continent'].value_counts(ascending=False).head(20).plot.barh(figsize=(10,8)) #autopct='%.2f')

In [None]:
#npr_2008['continent'].value_counts(ascending=False)

In [None]:
npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries'].value_counts(ascending=False).head(20)

In [None]:
npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries'].value_counts(ascending=False).sum()

In [None]:
npr_topics['countries'].shape

In [None]:
#Proportion of papers mentioning specific countries
# Computed from the data instead of hard-coded totals. Each paper is counted
# once, however many countries it mentions; the sum of the per-country counts
# used before counts a paper once per country it mentions.
papers_with_country = npr_topics['countries'].explode().dropna().index.nunique()
papers_with_country / len(npr_topics) * 100

In [None]:
df8 = npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries'].value_counts(ascending=False).rename_axis('countries').to_frame('counts')


In [None]:
npr_topics['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries']

In [None]:
df8.reset_index(inplace=True)

In [None]:
df8['countries'].iloc[100:157]

In [None]:
def _lookup_or_blank(converter, value):
    """Apply ``converter`` to ``value`` and return '' when the lookup fails."""
    try:
        return converter(value)
    # Unknown names/codes (presumably a KeyError from pycountry_convert -
    # verify) and non-string entries are mapped to ''; other exceptions are no
    # longer swallowed.
    except (KeyError, TypeError, AttributeError):
        return ''


def _continent_of(country_name):
    """Return the continent code of ``country_name`` via its alpha-2 code."""
    return country_alpha2_to_continent_code(country_name_to_country_alpha2(country_name))


# The two lookups are resolved independently, so a country whose continent
# cannot be determined still keeps its alpha-3 code (previously both were
# blanked together).
code = [_lookup_or_blank(country_name_to_country_alpha3, p) for p in df8.countries]
continents = [_lookup_or_blank(_continent_of, p) for p in df8.countries]
df8['code'] = code
df8['continents'] = continents

In [None]:
df8.to_csv("check_countries.csv")  

In [None]:
df8['code'].iloc[56] = 'ATA'

In [None]:
df8['continents'].iloc[56] = 'AQ'

In [None]:
df8['code'].iloc[126] = 'PSE'

In [None]:
df8['continents'].iloc[126] = 'AS'

In [None]:
df8['code'].iloc[103] = 'XK'

In [None]:
df8['continents'].iloc[103] = 'EU'

In [None]:
df8.iloc[0:50,:]

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import kaleido

fig = go.Figure(data=go.Choropleth(
    locations = df8['code'],
    z = df8['counts'],
    text = df8['countries'],
    colorscale = 'sunsetdark',
    autocolorscale=False,
    reversescale=False,
    #marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_tickprefix = '',
    colorbar_title = 'Country count', 
))
fig.update_geos(projection_type="natural earth", showcountries = True,showlakes = False,
                showocean=True, oceancolor="LightBlue")

fig.update_layout(title_text = 'Geographical distributions of case studies')
                                                                                                               
fig.show()

fig.write_image("../Figure_GPT2_8/Map_2.svg")

#plt.savefig('../Figure_GPT2_6/Map_2.eps', dpi=400)

In [None]:
# Total number of country mentions per continent. Only 'counts' is aggregated:
# summing the whole frame would also "sum" the text columns (country names and
# codes), which is meaningless for this table.
aa = df8.groupby('continents')[['counts']].sum()
ax = aa.plot.pie(subplots=True,figsize=(7,7),legend = True, autopct='%1.1f%%', labeldistance = None)
plt.ylabel(None)
plt.title('Studies per continent',fontsize =16, loc="left")
# Place the legend outside the pie so it does not cover the wedges.
plt.legend(bbox_to_anchor=(1.1, 0.8), fontsize = 12)
figure_name = 'Pie_Chart_Continents'
plt.savefig('../Figure_GPT2_8/Pie_continents.eps', dpi='figure',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)
plt.show()

In [None]:
aa.index

In [None]:
my_continets = {'AF': 'Africa', 'AQ':'Antarctica', 'AS': 'Asia',
               'EU': 'Europe', 'NA':'North America', 'OC': 'Oceania','SA': 'South America'}

In [None]:
aa.index = aa.index.map(my_continets)

In [None]:
aa

In [None]:
aa['counts'][i]

In [None]:
# Donut chart of country mentions per continent, with one call-out label per
# wedge showing the continent and its count.
fig, ax = plt.subplots(figsize=(10, 10),subplot_kw=dict(aspect="equal"))

wedges, texts = ax.pie(aa['counts'], wedgeprops=dict(width=0.5))
bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

# Iterating the index and the values together avoids integer look-ups such as
# aa['counts'][i], which are ambiguous on a label-based index.
for p, continent, count in zip(wedges, aa.index, aa['counts']):
    # Angle of the wedge centre and the matching point on the unit circle.
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    # Labels on the left half are right-aligned and vice versa; x == 0 is
    # treated as the right half instead of raising a KeyError.
    horizontalalignment = "right" if x < 0 else "left"
    connectionstyle = f"angle,angleA=0,angleB={ang}"
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    # Readable text instead of the repr of a (name, count) tuple.
    ax.annotate(f"{continent}: {count}", xy=(x, y), xytext=(1.2*np.sign(x), 1*y),
                horizontalalignment=horizontalalignment, **kw, fontsize=14)

#ax.set_title("Matplotlib bakery: A donut")
figure_name = 'Pie_Chart_Continents'
plt.savefig('../Figure_GPT2_8/Donuts_continents.eps', dpi='figure',
        bbox_inches='tight', pad_inches=0.1,
        facecolor='auto', edgecolor='auto',
        backend=None)
plt.show()

In [None]:
df12 = df8.groupby(df8['continents']).sum()

In [None]:
df12.reset_index(inplace=True)

In [None]:
# Display name for every continent code, looked up by code rather than by row
# position. `df12` is ordered by code (..., 'NA', 'OC', 'SA'), so the previous
# positional list labelled Oceania as 'S. America' and South America as
# 'Oceania', and it failed whenever `df12` did not have exactly seven rows.
continent_names = {
    'AF': 'Africa',
    'AQ': 'Antarctica',
    'AS': 'Asia',
    'EU': 'Europe',
    'NA': 'N. America',
    'OC': 'Oceania',
    'SA': 'S. America',
}
full_name = df12['continents'].map(continent_names).tolist()
df12['Continent_Full_Name'] = full_name

In [None]:
df12

In [None]:
fig = plt.figure(figsize=(6,8))
ax = fig.add_subplot(111, projection="polar")

# theta has 7 different angles, and the first one repeated
theta = np.arange(len(df12) + 1) / float(len(df12)) * 2 * np.pi
# values has the 7 values from 'counts', with the first element repeated
values = df12['counts'].values
values = np.append(values, values[0])

# draw the polygon and the mark the points for each angle/value combination
l1, = ax.plot(theta, values, color="C2", marker="o", label="counts")
plt.xticks(theta[:-1], df12['Continent_Full_Name'], color='black', size=10)
ax.tick_params(pad=20,labelrotation =0) # to increase the distance of the labels to the plot
# fill the area of the polygon with green and some transparency
#ax.fill(theta, values, 'blue', alpha=0.1)

#plt.legend() # shows the legend, using the label of the line plot (useful when there is more than 1 polygon)
#plt.title("Title")
plt.show()

In [None]:
df10 = npr_topics[npr_topics['Topic'] == 1]['countries'].explode().reset_index().drop_duplicates(keep = "first")['countries'].value_counts(ascending=False).rename_axis('countries').to_frame('counts')


In [None]:
from collections import Counter
def get_keys(topic_matrix):
    '''
    For every row of ``topic_matrix`` find the column holding the
    largest value and return those column indices as a plain list
    (one predicted topic per row)
    '''
    strongest_topic = topic_matrix.argmax(axis=1)
    return strongest_topic.tolist()

def keys_to_counts(keys):
    '''
    Tally how often each topic key occurs. Returns a tuple of two
    parallel lists: the distinct keys (in order of first appearance)
    and the number of occurrences of each
    '''
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))

In [None]:
topic_results.shape

In [None]:
nmf_keys = get_keys(topic_results)
nmf_categories, nmf_counts = keys_to_counts(nmf_keys)

In [None]:
def get_top_n_words(n_top_words, count_vectorizer, text_data):
    '''
    returns a tuple of the top n words in a sample and their 
    accompanying counts, given a CountVectorizer object and text sample

    n_top_words: how many words to return
    count_vectorizer: vectorizer that is (re)fitted on text_data by this call
    text_data: object exposing the documents through ``.values``

    NOTE: a later cell defines another function with the same name and a
    different signature; once that cell has run, this version is no
    longer reachable by name.
    '''
    # Fit on the given texts and total every term over all documents.
    vectorized_headlines = count_vectorizer.fit_transform(text_data.values)
    vectorized_total = np.sum(vectorized_headlines, axis=0)
    # Ascending sort followed by a flip along axis 1 gives descending order.
    # NOTE(review): the [0,:] / [0,i] indexing assumes the total stays
    # two-dimensional (1 x n_terms) - confirm for the matrix type in use.
    word_indices = np.flip(np.argsort(vectorized_total)[0,:], 1)
    word_values = np.flip(np.sort(vectorized_total)[0,:],1)
    
    # One one-hot row per top word, so that inverse_transform yields exactly
    # that word for the row.
    word_vectors = np.zeros((n_top_words, vectorized_headlines.shape[1]))
    for i in range(n_top_words):
        word_vectors[i,word_indices[0,i]] = 1

    # encode('ascii') raises UnicodeEncodeError for words with non-ASCII
    # characters.
    words = [word[0].encode('ascii').decode('utf-8') for 
             word in count_vectorizer.inverse_transform(word_vectors)]

    return (words, word_values[0,:n_top_words].tolist()[0])

In [None]:
len(nmf_keys)

In [None]:
n_topics = 15

In [None]:
# Define helper functions
def get_top_n_words(n, keys, document_term_matrix, count_vectorizer, num_topics=None):
    '''
    returns a list of strings, one per topic, where each string contains the
    n highest-weighted words of the documents assigned to that topic, in
    descending order of weight

    n: number of words per topic
    keys: predicted topic index of every document (same order as the rows of
        document_term_matrix)
    document_term_matrix: documents x terms matrix (sparse or dense)
    count_vectorizer: fitted vectorizer providing get_feature_names_out()
    num_topics: number of topics; defaults to the notebook-level n_topics

    A topic without any assigned document yields an empty string (the
    previous implementation raised an AttributeError in that case).
    Non-ASCII characters are dropped from the returned words.
    '''
    if num_topics is None:
        num_topics = n_topics

    keys_arr = np.asarray(keys)
    # Column position -> term; replaces one inverse_transform call per word.
    vocabulary = count_vectorizer.get_feature_names_out()

    top_words = []
    for topic in range(num_topics):
        rows = np.flatnonzero(keys_arr == topic)
        if rows.size == 0:
            top_words.append("")
            continue
        # Sum the rows of all documents of this topic in one operation;
        # np.asarray(...).ravel() flattens matrix and array results alike.
        term_totals = np.asarray(document_term_matrix[rows].sum(axis=0)).ravel()
        # argsort is ascending: take the last n entries and reverse them.
        ranked = np.argsort(term_totals)[-n:][::-1]
        topic_words = [
            str(vocabulary[index]).encode('ascii','ignore').decode('utf-8')
            for index in ranked
        ]
        top_words.append(" ".join(topic_words))
    return top_words

In [None]:
# Two top words for every NMF topic, printed with 1-based topic numbers.
top_n_words_nmf = get_top_n_words(2, nmf_keys, dtm,tfidf)

for topic_number, topic_words in enumerate(top_n_words_nmf, start=1):
    print("Topic {}: ".format(topic_number), topic_words)

In [None]:
# Define helper functions
def get_mean_topic_vectors(keys, two_dim_vectors, num_topics=None):
    '''
    returns a list of centroid vectors, one for each predicted topic category

    keys: predicted topic index of every document
    two_dim_vectors: array with one embedded point per document, in the same
        order as keys
    num_topics: number of topics; defaults to the notebook-level n_topics

    A topic without any assigned document gets a centroid filled with NaN
    (the previous implementation raised a ValueError in that case).
    '''
    if num_topics is None:
        num_topics = n_topics

    keys_arr = np.asarray(keys)
    vectors = np.asarray(two_dim_vectors)

    mean_topic_vectors = []
    for t in range(num_topics):
        # Boolean mask selects every point of topic t at once.
        members = vectors[keys_arr == t]
        if len(members) == 0:
            mean_topic_vectors.append(np.full(vectors.shape[1:], np.nan))
        else:
            mean_topic_vectors.append(members.mean(axis=0))
    return mean_topic_vectors

In [None]:
tsne_lsa_vectors_2 = tsne_lsa_model.fit_transform(topic_results)

In [None]:
nmf_mean_topic_vectors = get_mean_topic_vectors(nmf_keys, tsne_lsa_vectors_2)

In [None]:
tsne_lsa_vectors[:]

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook


In [None]:
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
colormap = colormap[:n_topics]

In [None]:
plot = figure(title="t-SNE Clustering of {} NMF Topics".format(n_topics), width=1000, height=1000)
plot.scatter(x=tsne_lsa_vectors_2[:,0], y=tsne_lsa_vectors_2[:,1], color=colormap[nmf_keys])

for t in range(n_topics):
    label = Label(x=nmf_mean_topic_vectors[t][0], y=nmf_mean_topic_vectors[t][1], 
                  text=top_n_words_nmf[t], text_color='black', text_align='center', x_offset=0, y_offset=-8)
    plot.add_layout(label)

show(plot)
#colormap[t]

In [None]:
### Let's run some historical research on the overall dataframe

In [None]:
npr

In [None]:
# Run every abstract through `britishize`, then replace each token of the
# result with its lemma and join the lemmas with single spaces.
npr['Abstract_british'] = npr['Abstract'].apply(britishize)
npr['abstract_lemmatized'] = npr['Abstract_british'].apply(
    lambda text: " ".join(token.lemma_ for token in nlp(text))
)

In [None]:
# Textual fix-ups applied to the lemmatised abstracts, in this order.
# NOTE(review): presumably these undo artefacts of the earlier processing
# ('data' lemmatised to 'datum', 'programme' expanded to 'programmeme') -
# confirm.
for target, plural in (('datum', 'data'), ('programmeme', 'programme')):
    npr['abstract_lemmatized'] = npr['abstract_lemmatized'].str.replace(target, plural)

In [None]:
npr2010 = npr[(npr['PubYear']>=1990) & (npr['PubYear'] <=2010)]
npr2023 = npr[(npr['PubYear']>2010) & (npr['PubYear'] <=2023)]

In [None]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1,2), stop_words=list(stop_words_2))

In [None]:
def word_weights(abstracts,file_name):
    """Rank terms by TF-IDF weight, export the ranking and build a word cloud.

    The notebook-level ``tfidf`` vectorizer is re-fitted on ``abstracts`` by
    this call.

    Parameters
    ----------
    abstracts : iterable of str
        Documents to vectorize.
    file_name : str
        Path of the Excel file that receives the ranking.

    Returns
    -------
    tuple
        ``(weights_df, w)``: the 200 terms with the highest mean TF-IDF weight
        (columns ``term`` and ``weight``, sorted descending) and a WordCloud
        built from the summed TF-IDF weight of every term.
    """
    dtm = tfidf.fit_transform(abstracts)

    # Column sums for the whole vocabulary in one pass, instead of extracting
    # one sparse column per term.
    column_sums = np.asarray(dtm.sum(axis=0)).ravel()
    # Iterating vocabulary_ keeps the term order used when the cloud was built
    # from a list of (word, weight) pairs.
    tfidf_weights = {word: column_sums[idx] for word, idx in tfidf.vocabulary_.items()}

    # Mean weight per term, in the column order of the fitted vocabulary.
    weights = np.asarray(dtm.mean(axis=0)).ravel().tolist()
    weights_df = pd.DataFrame({'term': tfidf.get_feature_names_out(), 'weight': weights})
    weights_df = weights_df.sort_values(by='weight', ascending=False).head(200)

    w = WordCloud(width=1000, height=800, mode='RGBA', background_color='white', random_state=42,colormap='viridis', max_words=500).fit_words(tfidf_weights)
    weights_df.to_excel(file_name, index = False, header=True)
    return weights_df,w

In [None]:
file_name_2010 ='../../Data/ClimateEducation/Data_September_2024/term_weight_2010_gram12.xlsx'
weights_df_2010, w2010 = word_weights(npr2010['abstract_lemmatized'],file_name_2010)

In [None]:
file_name_2023 ='../../Data/ClimateEducation/Data_September_2024/term_weight_2023_gram12.xlsx'
weights_df_2023,w2023 = word_weights(npr2023['abstract_lemmatized'],file_name_2023)

In [None]:
# Two bar plots side by side (1 row, 2 columns): the 100 highest-weighted terms
# of each period.
fig, ax1 = plt.subplots(1, 2, figsize=(20, 18))

period_panels = (
    (ax1[0], weights_df_2010, 'Word frequency 1990-2010'),
    (ax1[1], weights_df_2023, 'Word frequency 2011-2023'),
)
for panel, period_weights, heading in period_panels:
    sns.barplot(x='weight',y = 'term',data = period_weights.head(100), palette='coolwarm_r', ax=panel)
    panel.set_title(heading, fontsize=18)
    panel.set_xlabel('Weights', fontsize=14)
    panel.set_ylabel('Words', fontsize=14)
    panel.tick_params(axis='y', labelsize=12)
    panel.tick_params(axis='x', labelsize=12)

# Keep the two panels and their labels from overlapping.
plt.tight_layout()

# Write the combined figure to disk before displaying it.
plt.savefig('../../Data/ClimateEducation/Figures_092024/WordFrequency_barplot.eps', 
            format='eps', dpi='figure',
            bbox_inches='tight', pad_inches=0.2,
            facecolor='auto', edgecolor='auto')

plt.show()

In [None]:
# The word clouds of the two periods, shown side by side.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 8))
csfont = {'fontname':'Arial'}
hfont = {'fontname':'Arial'}

title1 = 'Word frequency 1990-2010'
title2 = 'Word frequency 2011-2023'

for cloud_axes, cloud, heading in ((ax1, w2010, title1), (ax2, w2023, title2)):
    cloud_axes.imshow(cloud, aspect='auto')   # render the word cloud as an image
    cloud_axes.set_title(heading, **csfont)   # title in the custom font
    cloud_axes.axis('off')                    # no ticks or frame around the image

# Leave generous padding between the two panels.
plt.tight_layout(pad=5)

# Write the combined figure to disk before displaying it.
plt.savefig('../../Data/ClimateEducation/Figures_092024/WordFrequency_ngrams12_time.svg', 
            format='svg', dpi='figure',
            bbox_inches='tight', pad_inches=0.2,
            facecolor='auto', edgecolor='auto')

plt.show()