In [227]:
# imports
import torch
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from collections import Counter
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.utils import shuffle

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from sklearn.decomposition import PCA, TruncatedSVD


# setup
DATAPATH = 'smallchungus.csv' 
# device
device = 'cpu'

In [199]:
df = pd.read_csv(DATAPATH)
df = shuffle(df)
df.head()

Unnamed: 0.1,Unnamed: 0,head,Body ID,Stance,body
27636,23420,ISIS Militants Allegedly Contracted Ebola,1351,unrelated,"There may be yet another bug in iOS 8, Apple's..."
13167,11979,"Confusion swirls, details murky in arrest of I...",720,discuss,DNA tests have confirmed that a daughter and a...
490,29334,Dog found abandoned at Scottish train station ...,1642,agree,The Scottish SPCA is appealing to the public f...
30499,26318,Was Alleged Audio of Michael Brown Shooting on...,1481,unrelated,"The bullet ricochets off the man's helmet, pro..."
29693,25669,Isis 'fed murdered kidnap victim to his own mo...,1442,unrelated,Updates at 5:10 p.m.\n\nSoldier killed in War ...


In [200]:
# takes in string & returns a cleaned string of all non-stop-words
def preprocess(text, lemmatizer = WordNetLemmatizer()):
    sw = set(stopwords.words('english'))
    text = re.sub(r'[^\w\s]', '', text).lower()
    s = ""
    for word in text.split():
        if word not in sw:
                s += (lemmatizer.lemmatize(word) + " ")
    return s

In [4]:
tokenizer = get_tokenizer("basic_english")
max_words = 20
embed_len=300
global_vectors = GloVe(name='840B', dim=embed_len)

In [223]:
# takes string, returns 6000 dim GloVe vector
def to_vector(s):
    X = tokenizer(preprocess(s))
    # fill / cut tokens to max size
    if len(X) < max_words:
        X = X+[""]*(max_words-len(X))
    else:
        X = X[:max_words]

    X_tensor = torch.zeros(1, max_words, embed_len)
    for i, j in enumerate(X):
        X_tensor[0][i] = global_vectors.get_vecs_by_tokens(j)
    return(X_tensor.reshape(1, -1))

def combined_vector(x, y):
    x = to_vector(x)
    y = to_vector(y)
    return torch.cat((x, y), 1).numpy()

def cosim(x, y):
    x = to_vector(x)
    y = to_vector(y)
    cosim = cosine_similarity(x, y)
    return cosim[0]

In [203]:
def stance_to_colour(r):
    s = r["Stance"]
    stance_map = {
                "agree": "lime",
                "disagree": "red",
                "discuss": "blue",
                "unrelated": "grey"
            }
    y_data = stance_map[s]
    return y_data

In [231]:
# test
#targets = [combined_vector(df["head"][i], df["body"][i])[0] for i in range(3000)]
targets = [to_vector(df["body"][i])[0].numpy() for i in range(3000)]
#cs = [cosim(df["head"][i], df["body"][i])[0] for i in range(3000)]

In [216]:
colour_labels = []
for i in range(3000):
    colour_labels.append(stance_to_colour(df.iloc[i]))

In [226]:

def cv(data):
    count_vectorizer = CountVectorizer()
    emb = count_vectorizer.fit_transform(data)
    return emb, count_vectorizer

def plot_LSA(test_data, test_labels, savepath="PCA_demo.csv", plot=True):
        lsa = TruncatedSVD(n_components=2)
        lsa.fit(test_data)
        lsa_scores = lsa.transform(test_data)
        color_mapper = {label:idx for idx,label in enumerate(set(test_labels))}
        color_column = [color_mapper[label] for label in test_labels]
        colors = ["lime", "red", "blue", "grey"]
        if plot:
            plt.scatter(lsa_scores[:,0], lsa_scores[:,1], s=8, alpha=.8, c=test_labels, cmap=matplotlib.colors.ListedColormap(colors))
            green_patch = mpatches.Patch(color='lime', label='agree')
            red_patch = mpatches.Patch(color="red", label='disagree')
            blue_patch = mpatches.Patch(color="blue", label='discuss')
            grey_patch = mpatches.Patch(color="grey", label='unrelated')
            plt.legend(handles=[green_patch, red_patch, blue_patch, grey_patch], prop={'size': 30})

train_counts, count_vectorizer = cv(df["head"])
fig = plt.figure(figsize=(16, 16))          
plot_LSA(targets, colour_labels)
plt.show()


ValueError: Expected 2D array, got 1D array instead:
array=[0.27138162 0.10205631 0.11079711 ... 0.14447063 0.20737205 0.16762617].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

<Figure size 1152x1152 with 0 Axes>