In [15]:
import sys, os
import nltk
import pandas as pd

In [16]:
from collections import defaultdict
from gensim import corpora, models, similarities
def make_dictionary(documents):
    """
    construct a dictionary, i.e. mapping btwn word ids and their freq of occurence in the whole corpus
    filter dictionary to remove stopwords and words occuring < min_count times
    
    input: documents is an iterable consisting of all the words in the corpus 
    output: filtered dictionary
    """

    
    dictionary = corpora.Dictionary(documents)

    stop_words = nltk.corpus.stopwords.words('english') 
    min_count = 2
    stop_ids = [dictionary.token2id[word] for word in stop_words
               if word in dictionary.token2id]
    rare_ids = [id for id, freq in dictionary.dfs.items()
                if freq < min_count]
    dictionary.filter_tokens(stop_ids + rare_ids)
    dictionary.compactify()
    return(dictionary)

def make_corpus(documents):
    """
    """
    dictionary = make_dictionary(documents)
    # convert corpus to vectors using bag-of-words representation, i.e. tuples of word indices and word counts
    corpus = [dictionary.doc2bow(words) for words in documents]
    return(corpus, dictionary)

def make_lsi_similarity_matrix(tfidf_corpus, dictionary):
    """
    construct LSI (latent semantic indexing) model on Tfidf-transformed corpus, print model topics, 
    return similarity matrix.
    """
    # construct model
    lsi = models.lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=1000) 
    lsi.save('lsi-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lsi[tfidf_corpus], num_best=1000)
    return(matsim)

def make_lda_similarity_matrix(corpus, dictionary):
    """
    Latent Dirichlet Allocation (LDA) model
    """
    # construct model
    lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=100)
    lda.save('lda-model.save')
    # create similarity matrix
    matsim = similarities.MatrixSimilarity(lda[corpus], num_best=1000)
    return(matsim)

In [17]:
# Read database of data
os.chdir('../data')

output_fname="articles-n-forums-posts.csv"

# Read articles from file
input_fname="AutismParentMagazine-posts-clean.csv"
df=pd.read_csv(input_fname,index_col=0)
df.index.name='post id'
df.head(1)



Unnamed: 0_level_0,title,source,category,text,href
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Autism, Head Banging and other Self Harming Be...",https://www.autismparentingmagazine.com/,['category-applied-behavior-analysis-aba'],For children with autism spectrum disorder (AS...,https://www.autismparentingmagazine.com/autism...


In [18]:
# Read articles from file
input_fname="ehealthforum-posts-clean.csv"
df2=pd.read_csv(input_fname,index_col=0)
df2.index.name='post id'
df2['source']='http://ehealthforum.com'
df2['category']='forums'
df2.head(1)


Unnamed: 0_level_0,title,text,href,user id,mother post id,source,category
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Possible autism signs ?,my nephew who is four years has very bad mood ...,http://ehealthforum.com/health/possible-autism...,55473.0,0,http://ehealthforum.com,forums


In [19]:
input_fname="MedHelp-posts-clean.csv"
df3=pd.read_csv(input_fname,index_col=0)
df3['source']='http://www.medhelp.org'
df3['category']='forums'

#
#Remove questions from forum:
df3=df3.drop(df3.loc[df3.index == df3['mother post id']].index)
del df3['mother post id']
del df3['user id']
df3.head(1)

Unnamed: 0_level_0,title,text,href,source,category
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,Inappropriate Masterbation Down Syndrome,"A related discussion, self injusry in masturba...",http://www.medhelp.org//posts/Autism--Asperger...,http://www.medhelp.org,forums


In [20]:
# Read articles from file
input_fname="reditt-posts.csv"
df4=pd.read_csv(input_fname,index_col=0)
df4.index.name='post id'
df4['source']='http://www.reditt.com'
df4['category']='forums'
df4.head(1)

Unnamed: 0_level_0,title,text,href,user id,source,category
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Sulfur and sulfates question.,I have been subbing in a special education cla...,http://www.reddit.com/r/autism/comments/1xuenq...,0,http://www.reditt.com,forums


In [21]:
# Join datasets
print(len(df))
df=df.append(df2,ignore_index=True)
print(len(df))
df=df.append(df3,ignore_index=True)
print(len(df))
df=df.append(df4,ignore_index=True)
print(len(df))
del df2
del df3
del df4

# delete mother post id
del df['mother post id']


212
1272
2249
3776


In [22]:
df.index.name='post id'


In [23]:
# Make shorter version of reference:

def short_ref(href):
    if 'autismparentingmagazine' in href:
        return 'AutismParentingMagazine'
    elif 'medhelp' in href:
        return 'MedHelp'
    elif 'ehealthforum' in href:
        return 'eHealthForum'
    elif 'reddit' in href:
        return 'Reddit'

df['href_short'] = df['href'].apply(short_ref)
    


Unnamed: 0_level_0,category,href,source,text,title,user id,href_short
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/autism...,https://www.autismparentingmagazine.com/,For children with autism spectrum disorder (AS...,"Autism, Head Banging and other Self Harming Be...",,AutismParentingMagazine


In [29]:
df.head(300)

Unnamed: 0_level_0,category,href,source,text,title,user id,href_short,tokens,text_short
post id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/autism...,https://www.autismparentingmagazine.com/,For children with autism spectrum disorder (AS...,"Autism, Head Banging and other Self Harming Be...",,AutismParentingMagazine,"[autism, head, bang, and, other, self, harm, b...",For children with autism spectrum disorder (AS...
1,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/high-q...,https://www.autismparentingmagazine.com/,Dr. Stephen Shore once said “If you’ve met one...,High Quality ABA Treatment: What Every Parent...,,AutismParentingMagazine,"[high, quality, aba, treatment, what, every, p...",Dr. Stephen Shore once said “If you’ve met one...
2,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/choosi...,https://www.autismparentingmagazine.com/,Help! I am going to be starting Applied Behav...,Help: I Don’t Know How to Choose an Applied Be...,,AutismParentingMagazine,"[help, i, don, t, know, how, to, choose, an, a...",Help! I am going to be starting Applied Behav...
3,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/help-a...,https://www.autismparentingmagazine.com/,How do you handle high anxiety of a child on t...,HELP: My Autistic Child is Absolutely Terrifie...,,AutismParentingMagazine,"[help, my, autistic, child, be, absolutely, te...",How do you handle high anxiety of a child on t...
4,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/help-i...,https://www.autismparentingmagazine.com/,A grandfather from Singapore asks… My eldest g...,HELP: I Need Communication Advice for Autistic...,,AutismParentingMagazine,"[help, i, need, communication, advice, for, au...",A grandfather from Singapore asks… My eldest g...
5,['category-applied-behavior-analysis-aba'],https://www.autismparentingmagazine.com/help-m...,https://www.autismparentingmagazine.com/,"Hi, I am the parent of a 13-year-old boy with ...",Help: My Aspie Lacks Motivation,,AutismParentingMagazine,"[help, my, aspie, lack, motivation, hi, i, be,...","Hi, I am the parent of a 13-year-old boy with ..."
6,['category-autism-advocacy'],https://www.autismparentingmagazine.com/autism...,https://www.autismparentingmagazine.com/,It will be some time before we see a positive ...,Exciting Campaign Launched to Tackle the Autis...,,AutismParentingMagazine,"[excite, campaign, launch, to, tackle, the, au...",It will be some time before we see a positive ...
7,['category-autism-advocacy'],https://www.autismparentingmagazine.com/restau...,https://www.autismparentingmagazine.com/,Restaurant owner Andrew Iredale could hardly b...,How One Restaurateur Makes Eating Out Autism-F...,,AutismParentingMagazine,"[how, one, restaurateur, make, eat, out, autis...",Restaurant owner Andrew Iredale could hardly b...
8,['category-autism-advocacy'],https://www.autismparentingmagazine.com/lloyd-...,https://www.autismparentingmagazine.com/,Autism can be one of the most difficult diagno...,Lloyd Claycomb’s Views on Autism,,AutismParentingMagazine,"[lloyd, claycomb, s, view, on, autism, autism,...",Autism can be one of the most difficult diagno...
9,['category-autism-advocacy'],https://www.autismparentingmagazine.com/erasin...,https://www.autismparentingmagazine.com/,"An Interview with Dr. Tony Attwood, PhD, AS/AS...",Erasing the Prejudice – Are People on the Spec...,,AutismParentingMagazine,"[erase, the, prejudice, be, people, on, the, s...","An Interview with Dr. Tony Attwood, PhD, AS/AS..."


In [24]:
# Tokenize data
import nltk
tokenizer = nltk.RegexpTokenizer(r'\w+')

# Get list of tokens from text in first article:
text = df['text'][0].lower()
# Add also title:
title = df['title'][0].lower()
ttext = tokenizer.tokenize(title+" "+text)

In [25]:
# Get a column with list of tokens:

# 1) convert to lower case 
# 2) get tokens
# 2) save data in a new column (tokens)
#df['tokens'] = df['text'].map(lambda x: tokenizer.tokenize(x.lower()))

# Join title and text into one column
df2=df[['title','text']].apply(lambda x: ','.join(x.astype(str)),axis=1)
df['tokens'] = df2.map(lambda x: tokenizer.tokenize(x.lower()))
del df2

In [26]:
# Short version of text
df['text_short']=df['text'].apply(lambda x: x[:200]+" (...)" if (len(x) > 200) else x)

In [27]:

# Lematize

from nltk.stem.wordnet import WordNetLemmatizer

for ii in df.index:
    tokens = df.loc[ii,'tokens']
    lemas=[]
    for word in tokens:
        ww=WordNetLemmatizer().lemmatize(word,'v')
        lemas.append(ww)
    df.set_value(ii,'tokens',lemas)

#    


In [28]:
# Save dataframe with tokens into files
df.to_csv(output_fname)

In [14]:
# Get similarity matrices
documents = df['tokens'].values
print(documents[:3])
corpus,dictionary = make_corpus(documents)

#Save corpus into file
import pickle
pickle.dump(dictionary,open("dictionary.save","wb"))
pickle.dump(corpus,open("corpus.save", "wb"))

tfidf = models.TfidfModel(corpus)
tfidf.save('tfidf.save')

lsi_matsim = make_lsi_similarity_matrix(tfidf[corpus], dictionary)
lda_matsim = make_lda_similarity_matrix(corpus, dictionary)

# The models are saved into files in the above routines
# Save similarity matrices too:
pickle.dump(lsi_matsim,open("lsi-matsim.save","wb"))
pickle.dump(lda_matsim,open("lda-matsim.save","wb"))

[ ['autism', 'head', 'bang', 'and', 'other', 'self', 'harm', 'behavior', 'for', 'children', 'with', 'autism', 'spectrum', 'disorder', 'asd', 'head', 'bang', 'be', 'a', 'common', 'way', 'to', 'self', 'soothe', 'and', 'communicate', 'need', 'both', 'neurotypical', 'and', 'autistic', 'baby', 'and', 'toddlers', 'seek', 'to', 'recreate', 'the', 'rhythm', 'that', 'stimulate', 'their', 'vestibular', 'system', 'while', 'in', 'utero', 'other', 'rhythmic', 'habit', 'that', 'fuel', 'a', 'child', 's', 'kinesthetic', 'drive', 'include', 'head', 'roll', 'body', 'rock', 'bite', 'and', 'thumb']
 ['high', 'quality', 'aba', 'treatment', 'what', 'every', 'parent', 'need', 'to', 'know', 'dr', 'stephen', 'shore', 'once', 'say', 'if', 'you', 've', 'meet', 'one', 'person', 'with', 'autism', 'you', 've', 'meet', 'one', 'person', 'with', 'autism', 'as', 'many', 'of', 'you', 'know', 'the', 'centre', 'for', 'disease', 'control', 'and', 'prevention', 'report', 'that', 'there', 'be', 'now', '1', 'in', '68', 'child