# Install and import

In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 3.3 MB/s eta 0:00:01
[?25hCollecting click
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 952 kB/s  eta 0:00:01
Collecting regex
  Downloading regex-2020.9.27-cp37-cp37m-manylinux2010_x86_64.whl (662 kB)
[K     |████████████████████████████████| 662 kB 23.4 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.50.1-py2.py3-none-any.whl (70 kB)
[K     |████████████████████████████████| 70 kB 902 kB/s  eta 0:00:01
[?25hBuilding wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434674 sha256=4ba9236d09d883dd1db381d09f762c14c0f2762467563de947f53c97d2ca5185
  Stored in directory: /home/jovyan/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected pa

In [2]:
!pip install networkx

Collecting networkx
  Downloading networkx-2.5-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 3.3 MB/s eta 0:00:01
Installing collected packages: networkx
Successfully installed networkx-2.5
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import re
from nltk.tokenize import sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Load the data

In [4]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip
# Extract word vectors: every line is parsed as a token followed by its
# whitespace-separated coefficients.
word_embeddings = {}

# The context manager closes the file even if a malformed line raises while
# it is being parsed.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [10]:
# Load ug_processed.csv; reset_index(drop=True) leaves a plain 0..n-1 row index.
df = pd.read_csv('ug_processed.csv').reset_index(drop=True)

In [11]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(df.shape)
df.head()

(4789, 44)


Unnamed: 0.1,Unnamed: 0,index,level,country,is_online,first_survey,primary_mode,preferred_mode,why_mode,platforms_used,...,active,passive,fully_active,partly_active,outside_passive,outside_interactive,outside_assignments,is_online_short,num_methods,num_techniques
0,0,4,Undergraduate (studying for associates or bach...,United Kingdom,Yes,No,recorded,recorded,Easier to follow and look back on,"Google Meet, Emailed/Uploaded readings and ass...",...,False,True,False,False,True,True,False,Yes,1,2
1,1,6,Undergraduate (studying for associates or bach...,United Kingdom,Yes,No,live,live,Most similar to normality and provides a routine,"Zoom, Emailed/Uploaded readings and assignments",...,True,True,True,False,False,True,False,Yes,2,1
2,2,9,Undergraduate (studying for associates or bach...,United Kingdom,Yes,Yes,recorded,live,It is more interactive,"Zoom, Recorded Lectures/videos from previous y...",...,True,True,False,True,False,True,True,Yes,0,3
3,3,13,Undergraduate (studying for associates or bach...,United Kingdom,Yes,Yes,live,live,Allows more of a conversation to flow and ques...,"Zoom, Webex",...,True,False,True,False,False,True,False,Yes,1,1
4,4,15,Undergraduate (studying for associates or bach...,Czech Republic,Yes,Yes,live,live,,"Microsoft Teams, Emailed/Uploaded readings and...",...,True,True,True,False,True,True,True,Yes,2,4


In [22]:
# Collect the non-missing values of each column as a plain Python list.
why_mode = df['why_mode'].dropna().to_list()
why_preference = df['why_preference'].dropna().to_list()
remote_changes = df['remote_changes'].dropna().to_list()
prior_changes = df['prior_changes'].dropna().to_list()

# Despite the name, this is the first 500 entries in their original order,
# not a random sample.
remote_changes_sampled = remote_changes[:500]

# Build the model

## Text processing

In [13]:
# tokenize sentences
def sentence_tokenize(data):
    """Split every entry of ``data`` into sentences and return one flat list.

    Each entry is converted with ``str()`` before it is handed to
    ``sent_tokenize``; sentences keep the order of the entries they came from.
    """
    flat = []
    for entry in data:
        # extend() flattens as we go instead of building a list of lists first
        flat.extend(sent_tokenize(str(entry)))
    return flat

In [14]:
# remove punctuations, numbers and special characters, lowercase
def lower_abc(data):
    """Replace every non-letter character with a space and lowercase the text.

    Parameters
    ----------
    data : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        One cleaned string per input, in the same order. Each character
        outside ``a-z``/``A-Z`` becomes exactly one space, so string lengths
        are preserved.
    """
    # A plain regex substitution is used instead of
    # ``pd.Series(data).str.replace(pattern, ' ')``: from pandas 2.0 on that
    # call treats the pattern as a literal string unless ``regex=True`` is
    # given, which silently leaves punctuation and digits in place.
    non_letters = re.compile('[^a-zA-Z]')
    return [non_letters.sub(' ', s).lower() for s in data]

In [15]:
# function to remove stopwords
def remove_stopwords(sen):
    """Join the tokens of ``sen`` that are not in ``stop_words`` with single spaces."""
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

In [16]:
def removed_joined(data):
    """Whitespace-split each string in ``data`` and pass the tokens to ``remove_stopwords``.

    Returns a list holding one ``remove_stopwords`` result per input string,
    in the same order.
    """
    cleaned = []
    for sentence in data:
        cleaned.append(remove_stopwords(sentence.split()))
    return cleaned

## Vectorization

In [17]:
# create vectors for sentences
# average dim of words
def vectorization(data):
    """Turn each cleaned sentence into one 100-dimensional vector.

    A sentence vector is the sum of its words' vectors from
    ``word_embeddings`` divided by ``(word count + 0.001)``; words missing
    from ``word_embeddings`` contribute a zero vector.

    Parameters
    ----------
    data : iterable of str
        Whitespace-separated sentences.

    Returns
    -------
    list of numpy.ndarray
        One array of shape ``(100,)`` per input sentence. Sentences without
        any word (empty or whitespace-only) map to the zero vector.
    """
    sentence_vectors = []
    for sentence in data:
        words = sentence.split()
        if words:
            v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) \
                / (len(words) + 0.001)
        else:
            # Test the token list, not the string length: a whitespace-only
            # string has no tokens, and summing nothing would produce the
            # scalar 0.0 instead of an array, breaking later reshape calls.
            v = np.zeros((100,))
        sentence_vectors.append(v)
    return sentence_vectors

## Apply PageRank algorithm

In [18]:
# similarity matrix
def sim_matrix(sentence_vectors, m):
    """Build the ``m`` x ``m`` cosine-similarity matrix of the first ``m`` vectors.

    Parameters
    ----------
    sentence_vectors : sequence of 1-D numpy.ndarray
        Sentence vectors, all of the same length.
    m : int
        Number of leading vectors to compare.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape ``(m, m)`` where entry ``[i][j]`` is the cosine
        similarity of vectors ``i`` and ``j``; the diagonal is 0.

    Raises
    ------
    IndexError
        If fewer than ``m`` vectors are supplied.
    """
    if m <= 0:
        # Nothing to compare; np.zeros also rejects a negative size.
        return np.zeros([m, m])
    vectors = sentence_vectors[:m]
    if len(vectors) < m:
        raise IndexError('sim_matrix needs at least m sentence vectors')
    # One pairwise call on the stacked matrix replaces m*m separate calls and
    # works for any embedding width, not only 100.
    sim_mat = np.array(cosine_similarity(np.vstack(vectors)), dtype=float)
    # Self-similarity is left at 0 so the graph gets no self-loop weight.
    np.fill_diagonal(sim_mat, 0.0)
    return sim_mat

In [19]:
# apply pagerank algorithm
def pagerank(sentences, sim_mat):
    """Rank ``sentences`` by their PageRank score on the similarity graph.

    Parameters
    ----------
    sentences : sequence of str
        Sentences; position ``i`` corresponds to row/column ``i`` of ``sim_mat``.
    sim_mat : numpy.ndarray
        Square matrix passed to ``nx.from_numpy_array`` to build the graph.

    Returns
    -------
    list of (float, str)
        ``(score, sentence)`` pairs sorted in descending order. Because whole
        tuples are compared, equal scores are ordered by the sentence text.
    """
    graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(graph)
    return sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

In [20]:
# summary extraction
def final(original_sentences):
    """Run the whole pipeline on ``original_sentences`` and return the ranking.

    Steps: sentence splitting -> letters-only lowercase -> stop-word removal
    -> sentence vectors -> similarity matrix -> PageRank. The ranking is
    computed over the tokenized (uncleaned) sentences.
    """
    raw_sentences = sentence_tokenize(original_sentences)
    cleaned = removed_joined(lower_abc(raw_sentences))
    vectors = vectorization(cleaned)
    similarity = sim_matrix(vectors, len(raw_sentences))
    return pagerank(raw_sentences, similarity)

# Run the model

In [23]:
# Keep the ranking in `result` so it can be reused without recomputing the
# similarity matrix; the bare name on the last line still displays it.
result = final(remote_changes_sampled)
result

[(0.0018175271869698795,
  'Have more time because it is not the same  if you are not used to online classes and sometimes it is difficult to understand some concepts that are really practical (and because of online classes you are not able to actually do so you can understand them better)'),
 (0.0018166444491600502,
  'Make the class sizes smaller on zoom lecturers so it’s easier to ask questions and have some sort of way to test our learning as we go rather than just having one big essay at the end to write'),
 (0.0018155555705278187,
  'Make it more engaging, it can be quite a one way learning system, more discussion'),
 (0.0018145564617524084,
  'Would be good if all online classes are made to be more interactive and allow for more thinking (instead of just absorbing of content).'),
 (0.0018142436881921824,
  'To have online lectures like we normally would do in person as we haven’t had any video call lectures on sites such as zoom at all and I am a third year student who has finis

In [None]:
# pd.Series(result).to_csv('remote_changes_theme')