In [None]:
# pandas provides the DataFrame used throughout this notebook.
import pandas as pd
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including library deprecation warnings.
warnings.filterwarnings('ignore')

# Mount Google Drive at /content/drive so the CSV paths used in later cells
# resolve. This import is only available inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the cleaned dataset from the mounted Drive folder into `df`.
cleaned_csv_path = '/content/drive/MyDrive/Project/csv_files/updated_cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path)

# Show the first five rows (the default for head()) as a quick sanity check.
df.head()

Unnamed: 0,document,summary,id,rg_labels,ext_labels
0,hi i I am getting a new laptop i do nt know wh...,of my post is asking what can i do compensate ...,TLDR_RS_2021-04-cm-17760.json,[1.0],[1]
1,it 's possible that many of you already do thi...,if you play hearthstone on mac and it runs poo...,TLDR_RS_2021-02-cm-3714.json,"[0.16403576510432702, 0.35967370810124305, 0.2...","[0, 1, 0, 0]"
2,"i will be the first to admit , i 've got a wei...",i 'm tired of my boyfriend bringing up my weig...,TLDR_RS_2021-04-cm-1732.json,"[0.1865050509103, 0.389592934910708, 0.2526842...","[0, 1, 0, 0, 0]"
3,"hello , my friend 22m broke up a few months ag...",: my friend has suddenly stopped talking to me...,TLDR_RS_2021-03-cm-34926.json,"[0.13981801700034802, 0.054837487497308005, 0....","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,background : my 18f ex boyfriend 18m really wa...,my ex is very erratic and depressed and never ...,TLDR_RS_2021-02-cm-18586.json,"[0.18652821177119003, 0.149288690725626, 0.038...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# VADER needs its lexicon on disk before the analyzer can be constructed.
nltk.download('vader_lexicon')

# One shared analyzer instance, reused for every call to vader_sentiment().
sia = SentimentIntensityAnalyzer()


def vader_sentiment(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The string to score.

    Returns:
        Whatever ``sia.polarity_scores`` returns for ``text``; callers in this
        notebook read its ``'compound'`` entry.
    """
    polarity = sia.polarity_scores(text)
    return polarity

def _compound_score(text):
    """Return only the 'compound' entry of the VADER scores for ``text``."""
    return vader_sentiment(text)['compound']


# For each text column: replace missing values with '' so the analyzer always
# receives a string, then store the compound score in '<column>_sentiment'.
for text_col in ('document', 'summary'):
    df[text_col] = df[text_col].fillna('')
    df[f'{text_col}_sentiment'] = df[text_col].apply(_compound_score)

# Preview each text next to its compound score.
sentiment_preview_cols = ['document', 'document_sentiment', 'summary', 'summary_sentiment']
df[sentiment_preview_cols].head(5)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,document,document_sentiment,summary,summary_sentiment
0,hi i I am getting a new laptop i do nt know wh...,0.9169,of my post is asking what can i do compensate ...,0.8176
1,it 's possible that many of you already do thi...,0.9418,if you play hearthstone on mac and it runs poo...,0.5859
2,"i will be the first to admit , i 've got a wei...",0.5734,i 'm tired of my boyfriend bringing up my weig...,-0.4404
3,"hello , my friend 22m broke up a few months ag...",-0.7086,: my friend has suddenly stopped talking to me...,0.0313
4,background : my 18f ex boyfriend 18m really wa...,-0.9274,my ex is very erratic and depressed and never ...,-0.6969


In [None]:
def interpret_sentiment(score):
    """Turn a numeric sentiment score into a categorical label.

    Args:
        score: Numeric sentiment score to classify.

    Returns:
        'Positive' when ``score`` is at least 0.05, 'Negative' when it is at
        most -0.05, and 'Neutral' for everything else.
    """
    threshold = 0.05
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    # Neither comparison held, so the score lies strictly inside the band.
    return 'Neutral'

# Replace the numeric compound scores with their categorical labels, keeping
# the same column names. The dtype guard makes this cell safe to re-run: once
# a column already holds string labels, calling interpret_sentiment on it
# would compare a str with a float and raise TypeError, so it is skipped.
for sentiment_col in ('document_sentiment', 'summary_sentiment'):
    if pd.api.types.is_numeric_dtype(df[sentiment_col]):
        df[sentiment_col] = df[sentiment_col].apply(interpret_sentiment)

# Preview each text next to its sentiment label.
df[['document', 'document_sentiment', 'summary', 'summary_sentiment']].head(5)

Unnamed: 0,document,document_sentiment,summary,summary_sentiment
0,hi i I am getting a new laptop i do nt know wh...,Positive,of my post is asking what can i do compensate ...,Positive
1,it 's possible that many of you already do thi...,Positive,if you play hearthstone on mac and it runs poo...,Positive
2,"i will be the first to admit , i 've got a wei...",Positive,i 'm tired of my boyfriend bringing up my weig...,Negative
3,"hello , my friend 22m broke up a few months ag...",Negative,: my friend has suddenly stopped talking to me...,Neutral
4,background : my 18f ex boyfriend 18m really wa...,Negative,my ex is very erratic and depressed and never ...,Negative


In [None]:
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

def _whitespace_tokens(value):
    """Split a string on whitespace; any non-string value yields an empty list."""
    if not isinstance(value, str):
        return []
    return value.split()


# Build one token-list column per text column for the topic models below.
for text_col, token_col in (('document', 'document_token'), ('summary', 'summary_token')):
    df[token_col] = df[text_col].apply(_whitespace_tokens)

def _fit_lda_and_label(token_lists, num_topics=10, topn=5, random_state=42,
                       no_topic_label='No Topic'):
    """Fit an LDA model on ``token_lists`` and label every entry with its dominant topic.

    Args:
        token_lists: Iterable of token lists, one per DataFrame row. Entries
            may be empty. It is iterated more than once.
        num_topics: Number of topics passed to ``LdaModel``.
        topn: Number of top words joined together to form each topic's label.
        random_state: Seed passed to ``LdaModel``.
        no_topic_label: Label given to entries whose bag-of-words is empty.

    Returns:
        A tuple ``(dictionary, corpus, model, topic_words, labels)``.
        ``corpus`` contains only the non-empty bag-of-words vectors (the ones
        the model is trained on). ``topic_words`` maps topic id to its label
        string. ``labels`` has exactly one entry per element of
        ``token_lists``, in the same order, so it can be assigned straight to
        a DataFrame column.
    """
    dictionary = corpora.Dictionary(token_lists)

    # One bag-of-words per row, empty rows included, so positions stay aligned
    # with the DataFrame. Filtering before labelling would make the label list
    # shorter than the DataFrame whenever a row has no tokens.
    bows = [dictionary.doc2bow(tokens) for tokens in token_lists]

    # Train only on rows that actually contain tokens.
    corpus = [bow for bow in bows if bow]
    model = LdaModel(corpus=corpus, id2word=dictionary,
                     num_topics=num_topics, random_state=random_state)

    # Human-readable label per topic: its top words, quoted and joined by ' + '.
    topic_words = {
        topic_id: ' + '.join(f'"{word}"' for word, _ in model.show_topic(topic_id, topn=topn))
        for topic_id in range(model.num_topics)
    }

    labels = []
    for bow in bows:
        if not bow:
            labels.append(no_topic_label)
            continue
        # model[bow] yields (topic_id, probability) pairs; keep the most probable topic.
        dominant_topic = max(model[bow], key=lambda pair: pair[1])[0]
        labels.append(topic_words[dominant_topic])

    return dictionary, corpus, model, topic_words, labels


# Topic model for the documents. Rows without tokens get 'No Topic', matching
# the placeholder used for summaries.
(dictionary_docs, corpus_docs, lda_model_docs,
 topic_words_docs, document_topic_labels) = _fit_lda_and_label(df['document_token'])
df['document_topics'] = document_topic_labels

# Topic model for the summaries, fitted independently of the document model.
(dictionary_summaries, corpus_summaries, lda_model_summaries,
 topic_words_summaries, summary_topic_labels) = _fit_lda_and_label(df['summary_token'])
df['summary_topics'] = summary_topic_labels

In [None]:
# Preview the token lists next to the topic label assigned to each row.
topic_preview_cols = ['document_token', 'document_topics', 'summary_token', 'summary_topics']
df[topic_preview_cols].head()

Unnamed: 0,document_token,document_topics,summary_token,summary_topics
0,"[hi, i, I, am, getting, a, new, laptop, i, do,...","""i"" + ""."" + ""the"" + ""to"" + "",""","[of, my, post, is, asking, what, can, i, do, c...",""","" + ""to"" + ""."" + ""a"" + ""you"""
1,"[it, 's, possible, that, many, of, you, alread...","""i"" + ""."" + ""the"" + ""to"" + "",""","[if, you, play, hearthstone, on, mac, and, it,...","""."" + ""the"" + "":"" + ""i"" + ""to"""
2,"[i, will, be, the, first, to, admit, ,, i, 've...","""he"" + ""him"" + ""."" + ""his"" + ""to""","[i, 'm, tired, of, my, boyfriend, bringing, up...","""i"" + ""."" + ""to"" + ""and"" + ""my"""
3,"[hello, ,, my, friend, 22m, broke, up, a, few,...","""i"" + ""."" + ""and"" + "","" + ""to""","[:, my, friend, has, suddenly, stopped, talkin...","""i"" + ""."" + ""to"" + ""and"" + ""my"""
4,"[background, :, my, 18f, ex, boyfriend, 18m, r...","""i"" + ""."" + ""and"" + "","" + ""to""","[my, ex, is, very, erratic, and, depressed, an...","""i"" + ""."" + ""to"" + ""and"" + ""my"""


In [None]:
# Final look at the first five rows with every derived column before saving.
df.head(n=5)

Unnamed: 0,document,summary,id,rg_labels,ext_labels,document_sentiment,summary_sentiment,document_token,summary_token,document_topics,summary_topics
0,hi i I am getting a new laptop i do nt know wh...,of my post is asking what can i do compensate ...,TLDR_RS_2021-04-cm-17760.json,[1.0],[1],Positive,Positive,"[hi, i, I, am, getting, a, new, laptop, i, do,...","[of, my, post, is, asking, what, can, i, do, c...","""i"" + ""."" + ""the"" + ""to"" + "",""",""","" + ""to"" + ""."" + ""a"" + ""you"""
1,it 's possible that many of you already do thi...,if you play hearthstone on mac and it runs poo...,TLDR_RS_2021-02-cm-3714.json,"[0.16403576510432702, 0.35967370810124305, 0.2...","[0, 1, 0, 0]",Positive,Positive,"[it, 's, possible, that, many, of, you, alread...","[if, you, play, hearthstone, on, mac, and, it,...","""i"" + ""."" + ""the"" + ""to"" + "",""","""."" + ""the"" + "":"" + ""i"" + ""to"""
2,"i will be the first to admit , i 've got a wei...",i 'm tired of my boyfriend bringing up my weig...,TLDR_RS_2021-04-cm-1732.json,"[0.1865050509103, 0.389592934910708, 0.2526842...","[0, 1, 0, 0, 0]",Positive,Negative,"[i, will, be, the, first, to, admit, ,, i, 've...","[i, 'm, tired, of, my, boyfriend, bringing, up...","""he"" + ""him"" + ""."" + ""his"" + ""to""","""i"" + ""."" + ""to"" + ""and"" + ""my"""
3,"hello , my friend 22m broke up a few months ag...",: my friend has suddenly stopped talking to me...,TLDR_RS_2021-03-cm-34926.json,"[0.13981801700034802, 0.054837487497308005, 0....","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",Negative,Neutral,"[hello, ,, my, friend, 22m, broke, up, a, few,...","[:, my, friend, has, suddenly, stopped, talkin...","""i"" + ""."" + ""and"" + "","" + ""to""","""i"" + ""."" + ""to"" + ""and"" + ""my"""
4,background : my 18f ex boyfriend 18m really wa...,my ex is very erratic and depressed and never ...,TLDR_RS_2021-02-cm-18586.json,"[0.18652821177119003, 0.149288690725626, 0.038...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",Negative,Negative,"[background, :, my, 18f, ex, boyfriend, 18m, r...","[my, ex, is, very, erratic, and, depressed, an...","""i"" + ""."" + ""and"" + "","" + ""to""","""i"" + ""."" + ""to"" + ""and"" + ""my"""


In [None]:
# Write the enriched DataFrame back to Drive without the index column.
# NOTE(review): the token-list columns hold Python lists, which a CSV stores
# as text, so a later read_csv will see strings there. Confirm that downstream
# consumers expect this.
text_analysis_csv_path = '/content/drive/MyDrive/Project/csv_files/updated_text_analysis.csv'
df.to_csv(text_analysis_csv_path, index=False)