In [2]:
import pandas as pd
data=pd.read_csv('Data.csv')
data.drop(columns=['label'], inplace=True)
print(data)
print(data.shape)

                                                Reviews
0     Okay, so I'm not a big video game buff, but wa...
1     The premise of this movie has been tickling my...
2     Jim Carrey is back to much the same role that ...
3     I read several mixed reviews and several of th...
...                                                 ...
7495  It was great to see some of my favorite stars ...
7496  2 stars for Kay Francis -- she's wonderful! An...
7497  I really wish i could give this a negative vot...
7498  I love B movies..but come on....this wasn't ev...
7499  As I said in my comment about the first part: ...

[7500 rows x 1 columns]
(7500, 1)


In [3]:
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import contractions
import swifter

def tosentences(article):
    sentences=article.split(". ")
    sentences=[sentence.strip() for sentence in sentences]
    return sentences

In [4]:
def preprocess(sentences, min_word_length=2):
    preprocessed_sentences = []
    for sentence in sentences:
        soup = BeautifulSoup(sentence, 'html.parser')
        sentence = soup.get_text()
        sentence = contractions.fix(sentence)
        words = word_tokenize(sentence)
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word.lower() not in stop_words]
        words = [re.sub(r'\d+', '', word) for word in words]  
        words = [re.sub(r'\d', '', word) for word in words]   
        words = [re.sub(r'(.)\1+', r'\1\1', word) for word in words] 
        words = [re.sub(r'\{.*?\}', '', word) for word in words] 
        words = [re.sub(r'^[\'\s]*|[^\w\s\'-]', '', word) for word in words] 
        words = [re.sub(r'\.{3,}', '', word) for word in words]  
        words = [re.sub(r'\.{4,}', '', word) for word in words]  
        words = [re.sub(r'\.( +)', '.', word) for word in words] 
        words = [word for word in words if len(word) >= min_word_length and not word.isnumeric() and word not in string.punctuation]
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
        preprocessed_sentence = ' '.join(words)
        preprocessed_sentences.append(preprocessed_sentence)
    
    return preprocessed_sentences

In [5]:
def wordtokenize(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        tokenized_sentences.append(words)
    return tokenized_sentences

In [6]:
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
import swifter
import pandas as pd

def compute_coherence(lda_model, tokenized_sentences, dictionary, corpus):
    coherence_model_lda = CoherenceModel(model=lda_model, texts=tokenized_sentences, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_lda.get_coherence()
    return coherence_score

def train(row):
    print(row.name,end=' ')
    tokenized_sentences = [word_tokenize(sentence) for sentence in row['processed']]
    dictionary = corpora.Dictionary(tokenized_sentences)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized_sentences]
    
    lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
    alpha=0.108714,
    eta=0.295088,
    decay=0.602035,
    offset=9.907192,
    random_state=77,
    minimum_probability=0.085315)
    
    coherence_score = compute_coherence(lda_model, tokenized_sentences, dictionary, corpus)
    
    sentence_topic_distributions = []
    for doc in corpus:
        sentence_topic_distribution = lda_model.get_document_topics(doc)
        sentence_topic_distributions.append(dict(sentence_topic_distribution))
        
    topic_words_list = []
    for idx, topic in lda_model.print_topics(-1):
        topic_words = [word.split('"')[1] for word in topic.split()[1:] if '"' in word]
        topic_words_list.append(topic_words)
    
    row['topic_words'] = topic_words_list
    row['sentence_topic_distributions'] = sentence_topic_distributions
    row['coherence_score'] = coherence_score
    return row

In [7]:
import ast
import swifter

def filter_topic_distribution(distribution_list):
    theta = 0.1
    t1_list = []
 
    for distribution in distribution_list:
        filtered_distribution = [{k: v} for k, v in distribution.items() if v > theta]
        t1_list.extend(filtered_distribution)
    return t1_list

In [8]:
def get_dominant_topics(sentence_topic_distribution):
    dominant_topics = []
    for distribution in sentence_topic_distribution:
        dominant_topic = max(distribution, key=distribution.get)
        dominant_topics.append(dominant_topic)
    return dominant_topics

In [9]:
def get_topic_words(row):
    index_values = row['ST']
    topic_words = row['topic_words']
    result = []
    for index in index_values:
        if 0 <= index < len(topic_words):
            result.append(topic_words[index])
        else:
            result.append(None) 
    return result

In [10]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from senticnet.senticnet import SenticNet
sid = SentimentIntensityAnalyzer()
sn = SenticNet()


def sentic(word):
     score=0
     if word in sn.data:
        sentiment_info = sn.concept(word)
        score = float(sentiment_info.get('polarity_value', 0))
     return score

def TEXTBLOB(word):  
    sentiment_score=TextBlob(word).sentiment.polarity
    return sentiment_score

def SA(word):
     sentiment_score = sid.polarity_scores(word)['compound'] 
     return sentiment_score
  
def match_and_calculate_sentiment(sentences, topic_words):
    sentiment_scores = []
    
    for sentence, words in zip(sentences, topic_words):
        sentiment_score=0
        for word in nltk.word_tokenize(sentence):
            if word in words:
                sentiment_score += (TEXTBLOB(word))
        sentiment_scores.append(sentiment_score)
        
    return sentiment_scores

In [11]:
def calculate_reaching_definition(df,r):
    definitions_generated = []
    definitions_killed = []
    
    for index, row in df.iterrows():
        row_definitions_generated = [set() for _ in range(len(row['processed']))]
        row_definitions_killed = [set() for _ in range(len(row['processed']))]
        
        for i, (sentence, topic_idx, sentiment_score) in enumerate(zip(row['processed'], row['ST'], row[r])):
            definition_generated = (topic_idx, sentiment_score)
            for j in range(i):
                if (row['ST'][j], row[r][j]) == definition_generated:
                    row_definitions_killed[i].add(j)
            row_definitions_generated[i].add(definition_generated)
            for j in range(i):
                if j not in row_definitions_killed[i]:
                    row_definitions_generated[i] |= row_definitions_generated[j]
        
        definitions_generated.append(row_definitions_generated)
        definitions_killed.append(row_definitions_killed)
    
    df['definitions_generated'] = definitions_generated
    df['definitions_killed'] = definitions_killed
    
    return df

In [12]:
def calculate_IN_OUT_sets(definitions_generated_col, definitions_killed_col):
    IN_sets = []
    OUT_sets = []
    for gen, killed in zip(definitions_generated_col, definitions_killed_col):
        OUT = [set() for _ in range(len(gen))]
        changed = True
        while changed:
            changed = False
            for i in range(len(gen)):
                IN = set().union(*[OUT[p] for p in range(len(gen)) if i not in killed[p]])
                new_OUT = gen[i].union(IN - killed[i])
                if new_OUT != OUT[i]:
                    OUT[i] = new_OUT
                    changed = True
        IN = [set().union(*[OUT[p] for p in range(len(gen)) if i in gen[p]]) for i in range(len(gen))]
        IN_sets.append(IN)
        OUT_sets.append(OUT)
    return IN_sets, OUT_sets

In [13]:
def generate_summary(sentences, S, sentiment_scores):
    summaries = []
    sentiment_scores_summary = []

    for doc_sentences, out, doc_sentiment_scores in zip(sentences, S, sentiment_scores):
        summary = ""
        total_weighted_sum = 0
        total_weight = 0
        added_indices = set()

        for out_set in out:
            for j in out_set:
                if isinstance(j, tuple):
                    j = j[0]
                j = int(j)  # Convert to integer
                if 0 <= j < len(doc_sentences) and j < len(doc_sentiment_scores):
                    sentiment_score = float(doc_sentiment_scores[j])
                    position = j
                    weight = position
                    weighted_sum = weight * sentiment_score
                    total_weighted_sum += weighted_sum
                    total_weight += weight
                    if j not in added_indices:
                        summary += doc_sentences[j] + " "
                        added_indices.add(j)

        if total_weight != 0:
            sentiment = total_weighted_sum / total_weight
        else:
            sentiment = 0
        
        summaries.append(summary.strip())
        sentiment_scores_summary.append(sentiment)

    return summaries, sentiment_scores_summary

In [14]:
from bs4 import BeautifulSoup
import contractions
import re
def preprocess_text(text):
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()
    text = contractions.fix(text)
    text = re.sub(r'\\', '', text)
    return text

In [15]:
def classify_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    if polarity > 0:
        return 'postive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

In [16]:
data['sentence']=data['Reviews'].swifter.apply(tosentences)
data['processed'] = data['sentence'].swifter.apply(preprocess)

Pandas Apply:   0%|          | 0/7500 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/7500 [00:00<?, ?it/s]

  soup = BeautifulSoup(sentence, 'html.parser')


In [17]:
df=data.iloc[:2500]

In [18]:
import time
from datetime import datetime
start_time = time.time()
Iter=df.apply(train,axis=1)
end_time = time.time()
execution_time_seconds=end_time - start_time
execution_time_formatted = execution_time_seconds/60
print("Execution Time:", execution_time_formatted)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 27

In [19]:
Iter

Unnamed: 0,Reviews,sentence,processed,topic_words,sentence_topic_distributions,coherence_score
0,"Okay, so I'm not a big video game buff, but wa...","[Okay, so I'm not a big video game buff, but w...",[Okay big video game buff game House Dead real...,"[[idea, one, movie, least, case, wood, would, ...","[{4: 0.9900984}, {2: 0.9716734}, {0: 0.9228832...",0.512954
1,The premise of this movie has been tickling my...,[The premise of this movie has been tickling m...,[premise movie tickling imagination quite time...,"[[like, place, good, Dave, film, two, look, mo...","[{4: 0.8892084}, {5: 0.8437177}, {6: 0.9705783...",0.448367
2,Jim Carrey is back to much the same role that ...,[Jim Carrey is back to much the same role that...,[Jim Carrey back much role played Mask timid g...,"[[handful, news, hard, predict, hoodlum, okay,...","[{1: 0.95740783}, {5: 0.9548809}, {0: 0.940859...",0.312442
3,I read several mixed reviews and several of th...,[I read several mixed reviews and several of t...,[read several mixed review several downright t...,"[[film, movie, part, plenty, lacking, predicta...","[{4: 0.91418356}, {7: 0.9488169}, {1: 0.966719...",0.451127
4,Warning: Does contain spoilers.<br /><br />Ope...,[Warning: Does contain spoilers.<br /><br />Op...,[Warning contain spoilersOpen EyesIf seen film...,"[[character, one, like, day, modern, Russel, l...","[{1: 0.9358747}, {1: 0.3848357, 6: 0.38782603}...",0.461053
...,...,...,...,...,...,...
2495,I thought the whole movie played out beautiful...,[I thought the whole movie played out beautifu...,[thought whole movie played beautifully fresh ...,"[[everyone, pick, Running, story, Messiah, nea...","[{1: 0.93588144}, {2: 0.9228906}, {4: 0.803320...",0.555756
2496,"""Seed"" is torture porn...no doubt about it. Bu...","[""Seed"" is torture porn...no doubt about it, B...","[Seed torture porn doubt, strangely Uwe Boll w...","[[Boll, pretty, even, improve, Perhaps, disgus...","[{1: 0.84371084}, {5: 0.9651998}, {5: 0.870341...",0.591447
2497,The title alone (along with the poster) is eno...,[The title alone (along with the poster) is en...,[title alone along poster enough give away Pro...,"[[parade, secretary, small, forced, another, s...","[{3: 0.9408637}, {6: 0.9520408}, {3: 0.9813775...",0.441880
2498,This is one of the best animated family films ...,[This is one of the best animated family films...,"[one best animated family film time, Moreover ...","[[cinematic, realistic, composed, people, burn...","[{7: 0.8892097}, {5: 0.94086474}, {6: 0.965198...",0.484371


In [20]:
Iter.to_csv('GA-4.csv')