In [None]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
stopwords = set(STOPWORDS)
import matplotlib.pyplot as plt

In [None]:
# Extend the stopword set in place with corpus-specific terms (full words and truncated forms).
stopwords |= {"imag", "lena", "lenna", "image", "pictur", "picture", "img"}

In [None]:
def clean_data(df, textCol, cleanTextCol, stop_words=None):
    """Write a lower-cased, letters-only, stopword-free version of a text column into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw text. It is modified in place (the cleaned
        column is added or overwritten) and also returned.
    textCol : str
        Name of the column containing the raw text. Missing values are
        treated as empty text.
    cleanTextCol : str
        Name of the column that receives the cleaned text.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the notebook-level
        ``stopwords`` set.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``cleanTextCol`` populated.
    """
    if stop_words is None:
        # Fall back to the stopword set defined at notebook level.
        stop_words = stopwords

    def _keep_tokens(text):
        """Re-join the tokens that are neither stopwords nor made up solely of 'x'."""
        # tok.strip("x") is empty exactly when the token consists only of "x"
        # characters, e.g. what remains of "512x512" once digits are stripped.
        return " ".join(
            tok for tok in text.split()
            if tok not in stop_words and tok.strip("x")
        )

    # Missing cells become empty strings so the string operations below cannot fail.
    cleaned = df[textCol].fillna("").astype(str).str.lower()

    # First pass on the raw tokens, so stopwords that contain punctuation
    # themselves (contractions such as "don't") still match.
    cleaned = cleaned.apply(_keep_tokens)

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the text untouched.
    cleaned = cleaned.str.replace('[^A-Za-z ]+', '', regex=True)

    # Second pass catches stopwords that were attached to punctuation
    # ("image," -> "image") and the bare "x" tokens left over from dimensions.
    # Only whole "x" tokens are dropped; the letter is kept inside real words.
    cleaned = cleaned.apply(_keep_tokens)

    df[cleanTextCol] = cleaned
    return df

In [None]:
# Read the CSV, keep a single row per URL, and discard rows that lack
# either a domain_end or a dataset value.
df = (
    pd.read_csv("data/all_final.csv")
    .drop_duplicates(subset=['url'])
    .dropna(subset=['domain_end', 'dataset'])
)

# Row count after de-duplication, plus the most frequent description languages.
print(len(df))
display(df['descrip_lang'].value_counts().head())

In [None]:
# Keep only rows whose description language is English.
# .copy() detaches the result from the unfiltered frame, so that columns
# assigned to it later do not trigger pandas' SettingWithCopyWarning.

df = df[df['descrip_lang']=="en"].copy()
print(len(df))
display(df['descrip_lang'].value_counts().head())

In [None]:
# Derive the cleaned 'tokens' column from the raw 'description' text.
df = clean_data(df, textCol='description', cleanTextCol='tokens')

In [None]:
# Normalise `year` to a whole-number string and keep rows from 2000 onwards.
# The cut-off is applied numerically: comparing year *strings* is
# lexicographic, so values such as "999" or "unknown" would pass ">= '2000'".
year_num = pd.to_numeric(df['year'], errors='coerce')  # missing / unparseable -> NaN
recent = year_num >= 2000                              # NaN compares False, so those rows drop out
# .assign returns a new frame, so later column assignments act on an independent object.
df = df[recent].assign(year=year_num[recent].astype(int).astype(str))

In [None]:
# Shortened copy of each document: its first 50 whitespace-separated tokens.
df['tokens_first'] = [' '.join(doc.split()[:50]) for doc in df['tokens']]

# Restrict the frame to the columns used from here on.
cols = [
    'dataset', 'url', 'title', 'domain', 'domain_end',
    'year', 'tokens', 'tokens_first',
]
df = df[cols]

In [None]:
# Preview the first row to check the selected columns.
df.iloc[:1]

In [None]:
# Number of documents per year, most frequent year first.
df.loc[:, 'year'].value_counts()

# Structural Topic Model (STM)

In [None]:
# Load the rpy2 IPython extension; the %R / %%R magics used from here on come from it.
%load_ext rpy2.ipython
# Hand the pandas DataFrame `df` to the R session under the same name.
%R -i df
# Convert the three columns used in the stm prevalence formula to R factors.
%R df$dataset <- as.factor(df$dataset)
%R df$domain_end <- as.factor(df$domain_end)
%R df$year <- as.factor(df$year)

In [None]:
%%R

# Attach the R packages for this session.
# NOTE(review): only stm-style calls appear in the visible cells; dplyr and
# broom may be needed by cells not shown here — confirm before removing.
library(stm)
library(dplyr)
library(broom)

# Run textProcessor over the cleaned token strings, passing the whole
# data frame along as document metadata.
processed <- textProcessor(df$tokens, metadata=df)

# Feed the processed documents, vocabulary and metadata to prepDocuments,
# then unpack its result into the three objects used by the later cells.
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <-out$vocab
meta <- out$meta

# Show the factor levels of domain_end in the prepared metadata.
print(levels(meta$domain_end))

In [None]:
%%R

# Fit the topic model with K=5 topics; the prevalence formula uses
# domain_end, dataset and year taken from `meta`.
# init.type="Spectral" and seed=1 are fixed explicitly.
# max.em.its=500 is presumably a cap on EM iterations — confirm in the stm docs.
mod.out <- stm(docs, vocab, K=5, prevalence=~domain_end+dataset+year, max.em.its=500, data=meta, init.type="Spectral", seed=1)

In [None]:
%%R
# Print the fitted model object.
mod.out

In [None]:
%%R

# For each of three label types ("prob", "frex", "score") draw two plots of
# the fitted model: a "summary" plot followed by a "labels" plot.

# Label type "prob"
plot(mod.out, type=c("summary"), labeltype=c("prob"), width=120)
plot(mod.out, type=c("labels"), labeltype=c("prob"), width=120)

# Label type "frex"
plot(mod.out, type=c("summary"), labeltype=c("frex"))
plot(mod.out, type=c("labels"), labeltype=c("frex"), width=120)

# Label type "score"
plot(mod.out, type=c("summary"), labeltype=c("score"))
plot(mod.out, type=c("labels"), labeltype=c("score"), width=120)

In [None]:
%%R

# Compute topic correlations for the fitted model with topicCorr and plot the result.
mod.out.corr <- topicCorr(mod.out)
plot(mod.out.corr)

In [None]:
%%R
# Run topicQuality on the fitted model and the prepared documents.
# NOTE(review): M=40 presumably sets how many top words per topic are scored — confirm in the stm docs.
topicQuality(mod.out, docs, M=40)