## Task-2

### Part 2.1

We are going to stick with our Business Actors vs Private Individuals (English speaking) conflict

In [1]:
from text_mining.data import get_research_dataframe

from sdm.config import get_db_connection

In [3]:
conn = get_db_connection(db_path="D:/UU/Sem3/SDM/social_data_mining/data/twitter.db")
df = get_research_dataframe(db=conn)
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26019 entries, 0 to 26018
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   account_id    26019 non-null  object
 1   tweet_text    26019 non-null  object
 2   account_type  26019 non-null  object
 3   lang          26019 non-null  object
 4   stance        25480 non-null  object
dtypes: object(5)
memory usage: 1016.5+ KB
None


Unnamed: 0,account_id,tweet_text,account_type,lang,stance
0,77254498,"""@ComposerCorner I am proud to be a #climatech...",Private individuals,en,For
1,77254498,#ActionDay ~ Blog ~ Greener Green Gifts ~ htt...,Private individuals,en,For
2,77254498,#Agnotology ~ @Ecojustice files complaint w/Co...,Private individuals,en,For
3,77254498,#AsPredicted ~ @Canada wins #FossiloftheDay ~ ...,Private individuals,en,For
4,77254498,#AusPol ~ #COP21 ~ #MentalHealth of #Australia...,Private individuals,en,For


### Part 2.2: Preprocessing

In [4]:
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [5]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https://t\.co/\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # words = text.split()  # Tokenize
    # words = [w for w in words if w not in ENGLISH_STOP_WORDS and len(w) > 2 and 'http' not in w]  # Remove stop words
    return text

In [6]:
def tokenize_text(text):
    return word_tokenize(text)

def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))
    return [word for word in tokens if word not in stop_words]

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [7]:
df["processed_text"] = df["tweet_text"].apply(preprocess_text)
df["tokens"] = df["processed_text"].apply(tokenize_text)
df["tokens"] = df["tokens"].apply(remove_stopwords)
df["tokens"] = df["tokens"].apply(lemmatize_tokens)

df.head()

Unnamed: 0,account_id,tweet_text,account_type,lang,stance,processed_text,tokens
0,77254498,"""@ComposerCorner I am proud to be a #climatech...",Private individuals,en,For,composercorner i am proud to be a climatechang...,"[composercorner, proud, climatechange, denier,..."
1,77254498,#ActionDay ~ Blog ~ Greener Green Gifts ~ htt...,Private individuals,en,For,actionday blog greener green gifts law cu...,"[actionday, blog, greener, green, gift, law, c..."
2,77254498,#Agnotology ~ @Ecojustice files complaint w/Co...,Private individuals,en,For,agnotology ecojustice files complaint wcompet...,"[agnotology, ecojustice, file, complaint, wcom..."
3,77254498,#AsPredicted ~ @Canada wins #FossiloftheDay ~ ...,Private individuals,en,For,aspredicted canada wins fossiloftheday it ge...,"[aspredicted, canada, win, fossiloftheday, get..."
4,77254498,#AusPol ~ #COP21 ~ #MentalHealth of #Australia...,Private individuals,en,For,auspol cop mentalhealth of australian wheat ...,"[auspol, cop, mentalhealth, australian, wheat,..."


### Part 2.3: LDA, one more time...

In [2]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [11]:
dictionary = Dictionary(df["tokens"])
# dictionary.filter_extremes(no_below=2, no_above=0.8)
print(len(dictionary))

25619


In [15]:
corpus = [dictionary.doc2bow(tokens) for tokens in df["tokens"]]
# print(corpus[:10])

In [16]:
lda_model = LdaModel(
                corpus=corpus,
                id2word=dictionary,
                random_state=42,
                passes=10,
                iterations=50
            )

In [17]:
print(lda_model.print_topics())

[(83, '0.217*"lpaa" + 0.128*"build" + 0.104*"washington" + 0.103*"write" + 0.102*"sink" + 0.081*"rt" + 0.069*"lpaa_live" + 0.038*"cop" + 0.015*"damage" + 0.010*"thought"'), (81, '0.475*"un" + 0.120*"earthtoparis" + 0.072*"impact" + 0.055*"cop" + 0.042*"air" + 0.033*"climatechange" + 0.026*"cleanenergy" + 0.024*"public" + 0.014*"transformation" + 0.010*"study"'), (58, '0.277*"du" + 0.097*"launch" + 0.060*"reach" + 0.058*"htt" + 0.057*"speech" + 0.045*"helping" + 0.044*"research" + 0.032*"speaks" + 0.028*"idea" + 0.016*"took"'), (62, '0.232*"take" + 0.179*"last" + 0.125*"city" + 0.074*"amp" + 0.065*"low" + 0.048*"speaking" + 0.044*"better" + 0.027*"cop" + 0.022*"everything" + 0.019*"place"'), (93, '0.125*"initiative" + 0.111*"cop" + 0.110*"sector" + 0.095*"agenda" + 0.070*"private" + 0.054*"side" + 0.050*"event" + 0.038*"forward" + 0.036*"resource" + 0.031*"adp"'), (29, '0.249*"planet" + 0.156*"business" + 0.098*"transition" + 0.080*"cop" + 0.071*"adapt" + 0.061*"help" + 0.045*"rt" + 0.0

In [18]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [19]:
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)

pyLDAvis.display(lda_vis)

I want to test the above with a different preprocessing function, so LDA one more time...

In [23]:
def aliter_preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https://t\.co/\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    words = text.split()  # Tokenize
    words = [w for w in words if w not in ENGLISH_STOP_WORDS and len(w) > 2 and 'http' not in w]  # Remove stop words
    return words

In [24]:
test_df = get_research_dataframe(db=conn)

test_df["processed_text"] = test_df["tweet_text"].apply(aliter_preprocess_text)
test_df.head()

Unnamed: 0,account_id,tweet_text,account_type,lang,stance,processed_text
0,77254498,"""@ComposerCorner I am proud to be a #climatech...",Private individuals,en,For,"[composercorner, proud, climatechange, denier,..."
1,77254498,#ActionDay ~ Blog ~ Greener Green Gifts ~ htt...,Private individuals,en,For,"[actionday, blog, greener, green, gifts, law, ..."
2,77254498,#Agnotology ~ @Ecojustice files complaint w/Co...,Private individuals,en,For,"[agnotology, ecojustice, files, complaint, wco..."
3,77254498,#AsPredicted ~ @Canada wins #FossiloftheDay ~ ...,Private individuals,en,For,"[aspredicted, canada, wins, fossiloftheday, ge..."
4,77254498,#AusPol ~ #COP21 ~ #MentalHealth of #Australia...,Private individuals,en,For,"[auspol, cop, mentalhealth, australian, wheat,..."


In [29]:
test_dictionary = Dictionary(test_df["processed_text"])
test_dictionary.filter_extremes(no_below=2, no_above=0.8)
print(len(test_dictionary))

12256


In [30]:
test_corpus = [test_dictionary.doc2bow(text) for text in test_df["processed_text"]]

In [34]:
test_lda_model = LdaModel(
                corpus=test_corpus,
                id2word=test_dictionary,
                random_state=42,
                passes=10,
                iterations=50,
                num_topics=50
            )

In [35]:
test_lda_vis = gensimvis.prepare(test_lda_model, test_corpus, test_dictionary)

pyLDAvis.display(test_lda_vis)