In [23]:
from gensim import corpora
from gensim.models import LsiModel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import SGDClassifier


from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem.snowball import SnowballStemmer

from matplotlib import pyplot as plt
import pandas as pd
import gensim


In [57]:
import re

# Getting the stemmer and stopwords to be removed
stemmer = SnowballStemmer("english")
stop = stopwords.words('english')
stop.append('@')
pd.set_option('display.max_colwidth', -1)


#Some Preprocessing Functions
def remove_digits(string):
    return re.sub('[^a-zA-Z]', ' ', string )

def remove_special_chars(string):
    return re.sub(r'\s+', ' ', string)

#https://www.datacamp.com/community/tutorials/discovering-hidden-topics-python
def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our courpus and Converting list of documents (corpus) into Document Term Matrix
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. dictionary = corpora.Dictionary(doc_clean)
    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    # generate LDA model
    return dictionary,doc_term_matrix

# A function to get the Latent Semantic Analysis of the words given to it, and it makes them into 3 topics
def train_and_print_topics(comment_clean):
    dictionary,doc_term_matrix=prepare_corpus(comment_clean)
    lsamodel = LsiModel(doc_term_matrix, num_topics=3, id2word = dictionary) 
    topics = lsamodel.print_topics(num_topics=3, num_words=50)
    for i in range(len(topics)):
        print("Topic",i+1,":",topics[i][1])

In [53]:
# Reading the dataset file, then Preprocessing the comments column 

df= pd.read_excel('AllchemtrailsComments.xlsx').dropna()


df['comment'] = df['comment'].apply(remove_digits)
df['comment'] = df['comment'].apply(remove_special_chars)


df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([word.lower() for word in x.split() ]))

comment_clean = df['comment']

comment_clean = [string.split() for string in comment_clean]

In [58]:
train_and_print_topics(comment_clean)

Topic 1 : 0.956*"u" + 0.281*"f" + 0.054*"e" + 0.038*"b" + 0.034*"c" + 0.021*"n" + 0.014*"aa" + 0.007*"ufe" + 0.003*"i" + 0.002*"fd" + 0.002*"uff" + 0.002*"fa" + 0.002*"fc" + 0.002*"fb" + 0.002*"ni" + 0.002*"fe" + 0.001*"ff" + 0.001*"ea" + 0.001*"ec" + 0.001*"ed" + 0.001*"xa" + 0.001*"eba" + 0.001*"cc" + 0.001*"ee" + 0.001*"d" + 0.001*"bb" + 0.001*"xbb" + 0.001*"xab" + 0.001*"ad" + 0.001*"bd" + 0.001*"go" + 0.001*"cf" + 0.001*"af" + 0.001*"s" + 0.001*"ab" + 0.001*"sleep" + 0.000*"tonight" + 0.000*"bore" + 0.000*"eb" + 0.000*"cd" + 0.000*"ef" + 0.000*"ba" + 0.000*"bomber" + 0.000*"scare" + 0.000*"nightmar" + 0.000*"cb" + 0.000*"de" + 0.000*"peopl" + 0.000*"dd" + 0.000*"the"
Topic 2 : -0.957*"f" + 0.280*"u" + -0.055*"aa" + 0.035*"e" + -0.026*"ufe" + 0.013*"c" + 0.011*"n" + -0.007*"fc" + -0.007*"fb" + -0.006*"fe" + -0.006*"b" + -0.006*"fa" + -0.006*"ff" + -0.006*"fd" + -0.005*"die" + -0.004*"go" + -0.004*"nim" + -0.004*"cking" + -0.004*"ec" + 0.004*"ni" + 0.003*"xa" + 0.002*"uff" + -0.002*

In [59]:
# Reading the dataset file, then Preprocessing the comments column 

df= pd.read_excel('All911Comments.xlsx').dropna()

df['comment'] = df['comment'].apply(remove_digits)
# df['comment'] = df['comment'].apply(remove_special_chars)

df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([word.lower() for word in x.split() ]))

comment_clean = df['comment']

comment_clean = [string.split() for string in comment_clean]


In [60]:
train_and_print_topics(comment_clean)

Topic 1 : 0.956*"u" + 0.281*"f" + 0.054*"e" + 0.038*"b" + 0.034*"c" + 0.021*"n" + 0.014*"aa" + 0.007*"ufe" + 0.003*"i" + 0.002*"fd" + 0.002*"uff" + 0.002*"fa" + 0.002*"fc" + 0.002*"fb" + 0.002*"ni" + 0.002*"fe" + 0.001*"ff" + 0.001*"ea" + 0.001*"ec" + 0.001*"ed" + 0.001*"xa" + 0.001*"eba" + 0.001*"cc" + 0.001*"ee" + 0.001*"d" + 0.001*"bb" + 0.001*"xbb" + 0.001*"xab" + 0.001*"ad" + 0.001*"bd" + 0.001*"go" + 0.001*"cf" + 0.001*"af" + 0.001*"s" + 0.001*"ab" + 0.001*"sleep" + 0.000*"tonight" + 0.000*"bore" + 0.000*"eb" + 0.000*"cd" + 0.000*"ef" + 0.000*"ba" + 0.000*"bomber" + 0.000*"scare" + 0.000*"nightmar" + 0.000*"cb" + 0.000*"de" + 0.000*"peopl" + 0.000*"dd" + 0.000*"the"
Topic 2 : -0.957*"f" + 0.280*"u" + -0.055*"aa" + 0.035*"e" + -0.026*"ufe" + 0.013*"c" + 0.011*"n" + -0.007*"fc" + -0.007*"fb" + -0.006*"fe" + -0.006*"b" + -0.006*"fa" + -0.006*"ff" + -0.006*"fd" + -0.005*"die" + -0.004*"go" + -0.004*"nim" + -0.004*"cking" + -0.004*"ec" + 0.004*"ni" + 0.003*"xa" + 0.002*"uff" + -0.002*

In [61]:
# Reading the dataset file, then Preprocessing the comments column 

df= pd.read_excel('AllFlatEarthComments.xlsx').dropna()

df['comment'] = df['comment'].apply(remove_digits)
df['comment'] = df['comment'].apply(remove_special_chars)

df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([word.lower() for word in x.split() ]))

comment_clean = df['comment']

comment_clean = [string.split() for string in comment_clean]


In [62]:
train_and_print_topics(comment_clean)

Topic 1 : 0.998*"u" + 0.041*"f" + 0.036*"e" + 0.026*"c" + 0.014*"n" + 0.012*"b" + 0.008*"U" + 0.004*"xa" + 0.003*"cc" + 0.002*"r" + 0.001*"ufe" + 0.001*"ba" + 0.001*"fb" + 0.001*"da" + 0.000*"take" + 0.000*"I" + 0.000*"af" + 0.000*"Bob" + 0.000*"earth" + 0.000*"g" + 0.000*"This" + 0.000*"uff" + 0.000*"Copy" + 0.000*"Google" + 0.000*"D" + 0.000*"flat" + 0.000*"Wiggle" + 0.000*"Youtube" + 0.000*"de" + 0.000*"thing" + 0.000*"many" + 0.000*"nI" + 0.000*"youtube" + 0.000*"df" + 0.000*"comment" + 0.000*"dd" + 0.000*"paste" + 0.000*"agree" + 0.000*"problems" + 0.000*"bob" + 0.000*"sections" + 0.000*"fs" + 0.000*"ufd" + 0.000*"aa" + 0.000*"w" + 0.000*"youtu" + 0.000*"nhttps" + 0.000*"Earth" + 0.000*"A" + 0.000*"ac"
Topic 2 : 0.744*"f" + 0.662*"U" + 0.080*"e" + -0.040*"u" + 0.016*"c" + 0.010*"b" + 0.008*"ufe" + 0.007*"fb" + 0.004*"fc" + 0.002*"n" + 0.001*"I" + 0.001*"earth" + 0.001*"flat" + 0.000*"The" + 0.000*"ec" + 0.000*"Earth" + -0.000*"Wiggle" + 0.000*"ff" + 0.000*"fa" + 0.000*"ea" + 0.000

In [63]:
# Reading the dataset file, then Preprocessing the comments column 

df= pd.read_excel('AllMoonLandingComments.xlsx').dropna()


df['comment'] = df['comment'].apply(remove_digits)
df['comment'] = df['comment'].apply(remove_special_chars)


df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([word.lower() for word in x.split() ]))
comment_clean = df['comment']

comment_clean = [string.split() for string in comment_clean]


In [64]:
train_and_print_topics(comment_clean)

Topic 1 : 0.993*"u" + 0.076*"f" + 0.066*"e" + 0.043*"c" + 0.043*"b" + 0.029*"n" + 0.006*"r" + 0.006*"fb" + 0.004*"uff" + 0.003*"xb" + 0.003*"U" + 0.002*"ago" + 0.002*"nReply" + 0.001*"I" + 0.001*"bd" + 0.001*"eb" + 0.001*"days" + 0.001*"fc" + 0.001*"cf" + 0.001*"moon" + 0.001*"ca" + 0.001*"eba" + 0.001*"af" + 0.001*"youtube" + 0.001*"dog" + 0.001*"fd" + 0.001*"views" + 0.001*"ed" + 0.001*"day" + 0.001*"The" + 0.001*"db" + 0.001*"de" + 0.001*"nMinecraft" + 0.001*"bf" + 0.001*"dd" + 0.001*"fa" + 0.001*"nSSundee" + 0.001*"ae" + 0.001*"NASA" + 0.001*"every" + 0.001*"nhttps" + 0.001*"A" + 0.001*"ab" + 0.001*"www" + 0.001*"ea" + 0.001*"cd" + 0.000*"E" + 0.000*"replies" + 0.000*"nView" + 0.000*"bc"
Topic 2 : -0.748*"U" + -0.656*"f" + 0.055*"u" + -0.034*"fd" + -0.029*"c" + -0.024*"fa" + -0.023*"moon" + -0.022*"e" + -0.020*"I" + -0.018*"b" + 0.017*"n" + -0.013*"ufe" + 0.009*"youtube" + -0.008*"The" + -0.008*"would" + 0.008*"www" + -0.008*"r" + 0.008*"com" + 0.008*"v" + 0.007*"watch" + -0.007*"l

In [65]:
# Reading the dataset file, then Preprocessing the comments column 

df= pd.read_excel('AllVaccinesComments.xlsx').dropna()

df['comment'] = df['comment'].apply(remove_digits)
df['comment'] = df['comment'].apply(remove_special_chars)


df['comment'] = df['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df['comment'] = df['comment'].apply(lambda x: ' '.join([word.lower() for word in x.split() ]))
comment_clean = df['comment']

comment_clean = [string.split() for string in comment_clean]


In [66]:
train_and_print_topics(comment_clean)

Topic 1 : 0.704*"f" + 0.695*"U" + 0.138*"u" + 0.038*"n" + 0.024*"c" + 0.015*"b" + 0.011*"e" + 0.005*"I" + 0.002*"ufe" + 0.002*"vaccines" + 0.002*"r" + 0.002*"bf" + 0.002*"vaccine" + 0.002*"people" + 0.001*"The" + 0.001*"fb" + 0.001*"bd" + 0.001*"get" + 0.001*"ab" + 0.001*"children" + 0.001*"one" + 0.001*"www" + 0.001*"like" + 0.001*"vaccinated" + 0.001*"autism" + 0.001*"bc" + 0.001*"know" + 0.001*"would" + 0.001*"af" + 0.001*"nhttps" + 0.001*"child" + 0.001*"ac" + 0.001*"anti" + 0.001*"It" + 0.001*"com" + 0.001*"even" + 0.001*"ba" + 0.001*"fc" + 0.001*"ae" + 0.001*"ad" + 0.001*"bb" + 0.001*"Dr" + 0.001*"kids" + 0.001*"measles" + 0.000*"ee" + 0.000*"disease" + 0.000*"ec" + 0.000*"cause" + 0.000*"fe" + 0.000*"think"
Topic 2 : 0.981*"u" + -0.115*"U" + -0.087*"f" + 0.080*"c" + 0.076*"b" + 0.054*"n" + 0.026*"e" + 0.021*"I" + 0.014*"bf" + 0.011*"r" + 0.009*"vaccines" + 0.009*"vaccine" + 0.008*"bd" + 0.006*"people" + 0.005*"bc" + 0.005*"The" + 0.005*"children" + 0.005*"ac" + 0.005*"get" + 0.0