# 1. Edit distances and NLTK

## Levenshtein distance

Any string can be turned into another string by inserting, deleting, or substituting characters. The Levenshtein distance between two strings counts the minimum number of edits needed to do this.

In [13]:
import nltk
#nltk.download('words')
from nltk.corpus import wordnet as wn
import pandas as pd
import networkx as nx
from nltk.metrics import edit_distance
from nltk.metrics import jaccard_distance
from nltk.corpus import words
from random import sample
import seaborn as sns
sns.set()

In [14]:
from random import Random

# Fixed seed so that Restart & Run All draws the same words every time and the
# edit-distance heatmap stays reproducible. Change the seed for a new draw.
SAMPLE_SEED = 42
N_SAMPLE_WORDS = 20

word_list = words.words()

# A dedicated Random instance keeps the seeding local instead of resetting the
# global random state shared with any other cell.
samples = Random(SAMPLE_SEED).sample(word_list, N_SAMPLE_WORDS)

In [15]:
# Pairwise edit distances: row r / column c holds edit_distance(samples[r], samples[c]).
distances = [
    [edit_distance(source, target) for target in samples]
    for source in samples
]

# Label both axes with the sampled words themselves.
df = pd.DataFrame(distances, index=samples, columns=samples)

In [None]:
# Heatmap of the pairwise edit-distance table `df` built in the previous cell.
sns.heatmap(df)

## Jaccard distance

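Jaccard distance derives from a related measure called Jaccard similarity. Jaccard similarity measures how similar two sets are to one another: it is the size of the intersection of the two sets divided by the size of their union. Jaccard distance is one minus the Jaccard similarity, so identical sets have a distance of 0 and sets with nothing in common have a distance of 1.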

In [None]:
text_1 = "The world is all that is the case"
text_2 = "The sun shone, having no alternative, on the nothing new"
text_3 = "It was a bright cold day in April, and the clocks were striking thirteen"

# Each text becomes the set of its space-separated tokens. Splitting on a single
# space leaves case and attached punctuation (e.g. "shone,") untouched.
texts = [set(raw_text.split(" ")) for raw_text in (text_1, text_2, text_3)]
labels = ["Text 1", "Text 2", "Text 3"]

# Jaccard distance for every ordered pair of token sets.
j_distances = [
    [jaccard_distance(left, right) for right in texts]
    for left in texts
]

df = pd.DataFrame(j_distances, index=labels, columns=labels)

sns.heatmap(df)

# 2. Stems and Lemmas

In [None]:
import string
from collections import Counter

from nltk import stem, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this section relies on so the notebook also runs in
# a fresh environment; nltk.download reports packages that are already present
# instead of fetching them again.
nltk.download('punkt')      # tokenizer models for word_tokenize
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' --
# confirm against the installed version.
nltk.download('stopwords')  # stop-word lists read by stopwords.words
nltk.download('wordnet')    # corpus required by WordNetLemmatizer.lemmatize

lemmatizer = WordNetLemmatizer()
stemmer = stem.PorterStemmer()
stops = set(stopwords.words('english'))
punct = list(string.punctuation)

In [25]:
# Implicit string concatenation keeps the sentence free of the run of
# indentation spaces that a backslash continuation inside the literal embeds.
sent = ("The leaves on the ground started to decompose as they were left "
        "untouched, and the leaf blower remained unused in the shed.")

# Named `sent_tokens` rather than `words` so that the `words` corpus imported
# from nltk.corpus in the first cell is not overwritten here (re-running the
# sampling cell would otherwise fail on `words.words()`).
sent_tokens = word_tokenize(sent)
stems = [stemmer.stem(token) for token in sent_tokens]
# Punctuation tokens are dropped from the lemma list only; the stems keep them.
lemmas = [lemmatizer.lemmatize(token) for token in sent_tokens if token not in punct]

# 3. APIs for data acquisition

In [39]:
import praw
import datetime
from getpass import getpass
# Prompt for the Reddit account password so it is never written into the notebook;
# the value is passed to praw.Reddit in the next cell.
password = getpass("Enter your password: ")

Enter your password:  ········


In [38]:
import os

# API credentials must not live in the notebook: read them from the environment
# and fall back to an interactive prompt when a variable is not set.
# NOTE: the client secret that used to be hard-coded in this cell has been
# exposed and should be revoked/regenerated in the Reddit app settings.
reddit_client_id = os.environ.get('REDDIT_CLIENT_ID') or getpass("Enter your Reddit client id: ")
reddit_client_secret = os.environ.get('REDDIT_CLIENT_SECRET') or getpass("Enter your Reddit client secret: ")

reddit = praw.Reddit(user_agent='VAD',
                     client_id=reddit_client_id, client_secret=reddit_client_secret,
                     username='textureai', password=password)

In [40]:
def submission(submission_id): ## submission_id can be URL or submission ID
    """Collect the comments of a Reddit submission into a DataFrame.

    Parameters
    ----------
    submission_id : str
        Either the full URL of the submission or its bare submission ID.

    Returns
    -------
    pandas.DataFrame
        One row per comment obtained by iterating ``submission.comments``
        (presumably the top-level comments only -- confirm against PRAW), with
        the columns ``text``, ``datetime``, ``score``, ``subreddit``,
        ``redditor``, ``type`` (always ``'comment'``) and ``title``, sorted by
        ``score`` in descending order with a fresh 0..n-1 index.
    """
    try:
        post = reddit.submission(url = submission_id)
    except praw.exceptions.ClientException:
        # The argument could not be used as a URL, so treat it as a submission
        # ID. Only PRAW's own client-side error triggers the fallback; anything
        # else (including KeyboardInterrupt) now propagates instead of being
        # swallowed by a bare ``except``.
        post = reddit.submission(submission_id)
    title = post.title
    post.comments.replace_more() ## loads new page if comments are multipage
    # Walk the comment forest once instead of once per column.
    comments = list(post.comments)
    df = pd.DataFrame()
    df['text'] = [comment.body for comment in comments]
    # NOTE(review): fromtimestamp() converts to the machine's local time zone.
    df['datetime'] = [datetime.datetime.fromtimestamp(comment.created) for comment in comments]
    df['score'] = [comment.score for comment in comments]
    df['subreddit'] = post.subreddit
    df['redditor'] = [comment.author for comment in comments]
    df['type'] = 'comment'
    df['title'] = title
    df = df.sort_values('score', ascending = False).reset_index(drop = True)
    return df

In [41]:
# Download the comments of one AskReddit thread; note that this rebinds `df`,
# which earlier cells used for the distance tables.
df = submission('https://www.reddit.com/r/AskReddit/comments/19ewqco/what_is_the_worst_reply_to_im_leaving_you/')

# 4. Word norms and sentiment

In [None]:
import plotly.express as px

# Columns of the sensorimotor norms that are kept; all other columns are dropped.
sensorimotor_columns = [
    'auditory', 'gustatory', 'haptic', 'interoceptive', 'olfactory',
    'visual', 'foot_leg', 'hand_arm', 'head', 'mouth', 'torso',
]

# Both spreadsheets use their first column as the row index.
vad = pd.read_excel('vad.xlsx', index_col = 0)  #VAD norms
sm = pd.read_excel('sensorimotor.xlsx', index_col = 0)[sensorimotor_columns] #Sensorimotor norms

In [None]:
VAD_SAMPLE_SIZE = 500
VAD_SAMPLE_SEED = 42

# Cap the sample at the table size (DataFrame.sample raises when asked for more
# rows than exist without replacement) and fix the seed so the same rows are
# plotted on every run.
vad_s = vad.sample(min(VAD_SAMPLE_SIZE, len(vad)), random_state=VAD_SAMPLE_SEED)

# 3-D scatter of the sampled rows; the index is attached as hover text so each
# point can be identified.
fig = px.scatter_3d(vad_s, x='valence', y='arousal', z='dominance', hover_data = [vad_s.index])
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.show("notebook")

In [None]:
text = "Spiders, crime, and earthquakes haunt my nightmares"
tokens = word_tokenize(text)
lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]

# Keep only the lemmas that appear in the index of the VAD table, together with
# the matching `vad.loc[...]` lookup for each of them.
# NOTE(review): `words` rebinds the name imported from nltk.corpus in the first
# cell, so the sampling cell cannot be re-run after this one.
words = [lemma for lemma in lemmas if lemma in vad.index]
emo = [vad.loc[lemma] for lemma in words]

# 5. Document representation using TF-IDF

In [43]:
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from IPython.display import IFrame
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from scipy.spatial import distance

In [49]:
vectorizer = TfidfVectorizer(input = 'content', strip_accents = 'ascii', stop_words = 'english')

# Toy corpus: five one-sentence documents.
D = [
    "The quick brown fox jumps over the lazy dog.",
    "Lazy pandas eat, sleep, and play all day without a care.",
    "Foxes are known for their quick movements and cunning nature.",
    "Unlike the active fox, the sloth is a very lazy animal.",
    "The quick hare was no match for the tortoise in the end.",
]

# Fit the vectorizer on the corpus and turn the resulting matrix into plain
# nested lists (one inner list per document).
v = vectorizer.fit_transform(D).todense().tolist()

# One row per document, one column per vocabulary term.
d = pd.DataFrame(
    v,
    columns=vectorizer.get_feature_names_out(),
    index=['d1', 'd2', 'd3', 'd4', 'd5'],
)

In [50]:
d

Unnamed: 0,active,animal,brown,care,cunning,day,dog,eat,end,fox,...,match,movements,nature,pandas,play,quick,sleep,sloth,tortoise,unlike
d1,0.0,0.0,0.468913,0.0,0.0,0.0,0.468913,0.0,0.0,0.378316,...,0.0,0.0,0.0,0.0,0.0,0.314037,0.0,0.0,0.0,0.0
d2,0.0,0.0,0.0,0.393795,0.0,0.393795,0.0,0.393795,0.0,0.0,...,0.0,0.0,0.0,0.393795,0.393795,0.0,0.393795,0.0,0.0,0.0
d3,0.0,0.0,0.0,0.0,0.428411,0.0,0.0,0.0,0.0,0.0,...,0.0,0.428411,0.428411,0.0,0.0,0.286912,0.0,0.0,0.0,0.0
d4,0.442832,0.442832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.442832,0.0,0.442832
d5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.474125,0.0,...,0.474125,0.0,0.0,0.0,0.0,0.317527,0.0,0.0,0.474125,0.0


In [None]:
n_docs = len(d)

# distances[r][c] is the cosine distance between rows r and c of the TF-IDF table.
distances = [
    [distance.cosine(d.iloc[r], d.iloc[c]) for c in range(n_docs)]
    for r in range(n_docs)
]

dist_df = pd.DataFrame(distances, columns = d.index, index = d.index)

sns.heatmap(dist_df)

# 6. Word embeddings

In [None]:
import gensim
import gensim.downloader as api
from sklearn.decomposition import PCA

# Read the text as ASCII and silently drop every byte outside that range. This
# removes the non-ASCII gibberish in one step and, unlike relying on the
# platform's default encoding, decodes the file the same way on every system.
with open('shake.txt', 'r', encoding='ascii', errors='ignore') as f:
    shake = f.read()

# Get rid of newline characters and make everything lowercase
shake = ' '.join(shake.splitlines())
shake = shake.lower()

# Naive sentence split on full stops, trimming the whitespace around each piece.
shake = [sentence.strip() for sentence in shake.split('.')]

shake_tokens = [word_tokenize(sentence) for sentence in shake]

# One list of lemmas per sentence, skipping stop words and punctuation tokens.
shake_lemmas = [
    [lemmatizer.lemmatize(token)
     for token in sentence_tokens
     if token not in stops and token not in punct]
    for sentence_tokens in shake_tokens
]