## Importing Relevant Libraries:


In [1]:
import os
import pandas as pd
import json
from tqdm import tqdm
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora, models
from gensim.utils import simple_preprocess
import gensim, spacy
from gensim.models.ldamulticore import LdaMulticore
import re
import numpy as np

from gensim.models import Phrases
from gensim.models.phrases import Phraser

## Importing Data:

In [2]:
# NOTE(review): machine-specific absolute path; change it when running elsewhere.
# Raw string so the backslashes can never be interpreted as escape sequences
# (the value is unchanged).
directory = r"D:\Study\Supermind\ds-data"
dfs= []
jsons = []
# Walk the export directory recursively and parse every file found as JSON.
for root, subdirectories, files in os.walk(directory):
    for file in files:
        filename = os.path.join(root, file)
        # The context manager guarantees the handle is closed; the previous
        # `f.close` (without parentheses) never actually closed the file.
        # JSON is read as UTF-8 rather than the platform default encoding.
        with open(filename, encoding="utf-8") as f:
            data = json.load(f)
        jsons.append(data)

In [3]:
# Column-oriented accumulator: one independent list per output column.
chats = {column: [] for column in ("Serial", "Id", "Text")}

In [4]:
# Flatten every parsed file into the column lists.
# "Serial" is the position of the source file within `jsons`.
for serial, messages in enumerate(jsons):
    for message in messages:
        chats["Serial"].append(serial)
        chats["Id"].append(message['id'])
        chats["Text"].append(message['text']['text'])
        

In [5]:
# One row per message, with columns Serial / Id / Text taken from `chats`.
df = pd.DataFrame(data=chats)

In [6]:
# Inspect the last rows of the assembled frame.
df.tail()

Unnamed: 0,Serial,Id,Text
34134,86,377889,That's unrelated. The question is what the inc...
34135,86,377888,If you have to pow on the user side then you'r...
34136,86,377887,there doesn't appear to be any. though if you ...
34137,86,377886,What is the incentive to validate txs in Nano?
34138,86,377885,I remember first hearing about it when it went...


## Using NLTK for Question Detection:

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
# Load the tokenizer and sequence-classification model for the named checkpoint.
# NOTE(review): neither `tokenizer` nor `model` is used by the cells visible here,
# and `model` is rebound later by the GSDMM fit — confirm this cell is still needed.
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")

model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/bert-mini-finetune-question-detection")


In [8]:
import nltk

In [9]:
# Fetch the NPS Chat corpus that the dialogue-act classifier is trained on.
nltk.download('nps_chat')

[nltk_data] Downloading package nps_chat to C:\Users\TANUSH
[nltk_data]     MAHAJAN\AppData\Roaming\nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


True

In [10]:
# Use the first 10,000 posts of the NPS Chat corpus as training/testing examples.
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    """Build a bag-of-words feature dict for one chat message.

    Each token produced by ``nltk.word_tokenize`` is lower-cased and stored
    under the key ``contains(<token>)`` with the value ``True``.
    """
    return {
        'contains({})'.format(token.lower()): True
        for token in nltk.word_tokenize(post)
    }

# Pair each post's features with its dialogue-act label (the XML "class" attribute).
featuresets = [
    (dialogue_act_features(post.text), post.get('class'))
    for post in posts
]

# 10% of the total data
size = int(len(featuresets) * 0.1)

# Hold out the first 10% for testing and train on the remaining 90%.
test_set = featuresets[:size]
train_set = featuresets[size:]

# Fit a Naive Bayes classifier on the training split.
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Accuracy on the held-out split was about 0.67 when checked with:
# print(nltk.classify.accuracy(classifier, test_set))

# Dialogue-act labels that count as questions.
question_types = ["whQuestion","ynQuestion"]
def is_ques_using_nltk(ques):
    """Return True when the trained classifier labels ``ques`` with one of ``question_types``."""
    return classifier.classify(dialogue_act_features(ques)) in question_types

## Using Sentence Structure To Detect Questions:

In [11]:
question_pattern = ["do i", "do you", "what", "who", "is it", "why","would you", "how","is there",
                    "are there", "is it so", "is this true" ,"to know", "is that true", "are we", "am i", 
                   "question is", "tell me more", "can i", "can we", "tell me", "can you explain",
                   "question","answer", "questions", "answers", "ask"]

helping_verbs = ["is","am","can", "are", "do", "does"]

# Whole-word matcher for the cue phrases above. Plain substring tests produced
# false positives such as "how" inside "show", "ask" inside "task" or "who"
# inside "whole"; anchoring on word boundaries avoids that.
# Built once from `question_pattern` at definition time.
QUESTION_PATTERN_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(phrase) for phrase in question_pattern) + r")\b"
)


def is_question(question):
    """Decide whether a chat message looks like a question.

    The message is lower-cased and stripped, then checked in order:

    1. the NLTK dialogue-act classifier (``is_ques_using_nltk``);
    2. any cue phrase from ``question_pattern`` appearing as whole words;
    3. any "."-separated sentence that ends with "?" or whose first token is
       one of ``helping_verbs``.

    Parameters
    ----------
    question : str
        Raw message text.

    Returns
    -------
    bool
        True as soon as one of the checks succeeds, otherwise False.
    """
    question = question.lower().strip()
    if is_ques_using_nltk(question):
        return True

    # check if any of the cue phrases exists in the message
    if QUESTION_PATTERN_RE.search(question):
        return True

    # there could be multiple sentences, so look at each one separately
    for sentence in question.split("."):
        # strip so whitespace before a "." cannot hide a trailing "?"
        sentence = sentence.strip()
        if not sentence:
            continue
        tokens = nltk.word_tokenize(sentence)
        if sentence.endswith("?") or (tokens and tokens[0] in helping_verbs):
            return True
    return False

In [13]:
# Placeholder column, filled by the labelling loop below.
df['is_question'] = ""

In [14]:
# Placeholder column for the classifier-only verdict, filled by the labelling loop below.
df['is_question_using_nltk'] = ""

In [15]:
# Confirm the two new (still empty) columns are present.
df.head()

Unnamed: 0,Serial,Id,Text,is_question,is_question_using_nltk
0,0,377801,wat is it lmao,,
1,0,377800,No risk here platform will be free,,
2,0,377799,@lordvladin im more confused after I read your...,,
3,0,377798,has anyone sold or bought a house using crypto...,,
4,0,377796,I sent you DM thank you,,


In [16]:
# Number of messages to label.
len(df)

34139

In [17]:
# Label every message with both detectors. Results are stored as the strings
# "True"/"False", or "None" when the message has no usable text.
# range(len(df)) covers all rows; the former `len(df)-1` bound left the last
# row unlabelled.
for i in tqdm(range(len(df))):
    text = df.at[i, 'Text']
    # Non-string values (e.g. missing text) are treated like empty messages
    # instead of crashing inside is_question's `.lower()`.
    if isinstance(text, str) and text:
        df.at[i,'is_question'] = str(is_question(text))
        df.at[i,"is_question_using_nltk"] = str(is_ques_using_nltk(text))
    else:
        df.at[i,'is_question'] = "None"
        df.at[i,"is_question_using_nltk"] = "None"

100%|███████████████████████████████████████████████████████████████████████████| 34138/34138 [00:59<00:00, 572.09it/s]


In [18]:
# Spot-check the labels on the first rows.
df.head(30)

Unnamed: 0,Serial,Id,Text,is_question,is_question_using_nltk
0,0,377801,wat is it lmao,True,False
1,0,377800,No risk here platform will be free,False,False
2,0,377799,@lordvladin im more confused after I read your...,False,False
3,0,377798,has anyone sold or bought a house using crypto...,True,False
4,0,377796,I sent you DM thank you,False,False
5,0,377795,Can DM me. I'm out rn but will get back by lat...,True,False
6,0,377794,Dequest is an sbt2 layer on web3 gamification ...,False,False
7,0,377793,"I think dequest focusing on web3 games, as i s...",False,False
8,0,377792,but feel free to dm me if using rmrk,False,False
9,0,377791,look into dequest too,False,False


In [20]:
# determining the name of the file
def uniquify(path):
    """Return ``path`` if it is free, otherwise the first free "name (N).ext" variant.

    N starts at 1 and increases until ``os.path.exists`` reports no collision.
    """
    stem, ext = os.path.splitext(path)
    candidate = path
    suffix = 0
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{stem} ({suffix}){ext}"
    return candidate
file_name = 'Dataset.xlsx'
  
# Save the labelled frame to Excel; uniquify picks an unused file name so an
# existing export is not overwritten.
df.to_excel(uniquify(file_name))

## Extracting Keywords From Questions For Tags

In [None]:
# Load the question-only sheet.
# NOTE(review): no visible cell writes Dataset_Questions.xlsx (the export above
# writes Dataset.xlsx) — presumably a manually filtered copy; confirm its origin.
PATH = './Dataset_Questions'
df_questions = pd.read_excel (f'{PATH}.xlsx')

In [None]:
import yake

In [None]:
# Extract single-word keywords for every question and store them as one
# comma-separated string in the "yake" column.
df_questions["yake"] = ""
kw_extractor = yake.KeywordExtractor()

for i in tqdm(range(len(df_questions))):
    text = df_questions.at[i, "Text"]
    # Guard against non-string cells (e.g. blanks read back from Excel).
    keywords = kw_extractor.extract_keywords(text) if isinstance(text, str) else []
    # Each entry is a (keyword, score) pair; keep one-word keywords above the threshold.
    # NOTE(review): YAKE documents lower scores as more relevant, so `> 0.05`
    # may be dropping the strongest keywords — confirm the intended direction.
    ls = [
        keyword
        for keyword, score in keywords
        if score > 0.05 and len(keyword.split(" ")) == 1
    ]
    # `listToString2` was never defined in this notebook, so this cell raised
    # NameError on a fresh kernel; the keywords are joined inline instead.
    # NOTE(review): ", " is an assumed separator — confirm the expected format.
    df_questions.at[i,'yake']  = ", ".join(ls)
        

## Saving Result to Excel File

In [None]:
# determining the name of the file
# (behaves the same as the `uniquify` defined in the earlier export cell)
def uniquify(path):
    """Return the first non-existing path among ``path``, "name (1).ext", "name (2).ext", ..."""
    if not os.path.exists(path):
        return path

    base, extension = os.path.splitext(path)
    attempt = 1
    while True:
        candidate = base + " (" + str(attempt) + ")" + extension
        if not os.path.exists(candidate):
            return candidate
        attempt += 1
file_name = 'Dataset_Questions_With_Keywords.xlsx'
  
# Save the questions together with their keywords; uniquify picks an unused
# file name so an existing export is not overwritten.
df_questions.to_excel(uniquify(file_name))

## Other Approaches:

We can use GSDMM for topic modelling, grouping documents that share a similar theme.

In [None]:
def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each sentence is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

# create N-grams
def make_n_grams(texts):
    """Return ``texts`` with detected bigram and trigram phrases applied.

    A bigram phrase model is trained on ``texts`` and a trigram phrase model
    on the bigram-transformed ``texts``; both are then applied to every document.
    """
    # higher threshold fewer phrases.
    bigram_phrases = gensim.models.Phrases(texts, min_count=5, threshold=100)
    bigram_model = gensim.models.phrases.Phraser(bigram_phrases)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[texts], threshold=100)
    trigram_model = gensim.models.phrases.Phraser(trigram_phrases)

    with_bigrams = [bigram_model[doc] for doc in texts]
    # NOTE(review): the bigram model is applied a second time here to documents
    # that already went through it; kept as in the original — confirm whether
    # the extra pass is intended.
    return [trigram_model[bigram_model[doc]] for doc in with_bigrams]

In [None]:
# Tokenize every message in df["Text"] with sent_to_words and report how many
# documents were produced.
tokens_reviews = list(sent_to_words(df["Text"]))
len(tokens_reviews)

In [None]:
# Apply the bigram/trigram phrase models to the tokenized messages.
# NOTE(review): this rebinds `tokens_reviews`, so re-running only this cell
# applies the n-gram pass to already-processed tokens.
tokens_reviews = make_n_grams(tokens_reviews)
len(tokens_reviews)

In [None]:
def remove_stopwords(texts):
    """Re-tokenize each document with ``simple_preprocess`` and drop gensim stop words.

    Every document is passed through ``str`` before tokenizing; the result
    is one filtered token list per input document.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    cleaned = []
    for doc in texts:
        kept = [token for token in simple_preprocess(str(doc)) if token not in stop_words]
        cleaned.append(kept)
    return cleaned

In [None]:
# spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each is joined with spaces before being parsed.
    allowed_postags : iterable of str, optional
        Coarse POS tags (``token.pos_``) to keep. Defaults to nouns and adjectives.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    allowed = set(allowed_postags)
    joined = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the documents in batches, which is considerably
    # faster than calling nlp() once per document and yields the same Docs.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined)
    ]

In [None]:
# Lemmatize, keeping only nouns and verbs. Adjectives are left out on the
# assumption that they are not informative for topic modelling.
# TODO: consider adding adjectives.
text_lemmatized = lemmatization(tokens_reviews, allowed_postags=['NOUN', 'VERB'])

# remove stop words after lemmatization
text_lemmatized = remove_stopwords(text_lemmatized)

In [None]:
# Seed NumPy's global random number generator.
# NOTE(review): presumably meant to make the GSDMM fit below reproducible —
# confirm that gsdmm draws from NumPy's global RNG.
np.random.seed(0)

In [None]:
from gsdmm import MovieGroupProcess

In [None]:
# Settings passed straight to MovieGroupProcess (K, alpha, beta, n_iters).
model_k = 40
model_alpha = 0.2
model_beta = 0.2
model_iters = 50
mgp = MovieGroupProcess(K=model_k, alpha=model_alpha, beta=model_beta, n_iters=model_iters)

# Vocabulary size = number of distinct tokens across all lemmatized documents.
vocab = set(x for text in text_lemmatized for x in text)
n_terms = len(vocab)
# NOTE(review): `model` rebinds the name used for the transformer model loaded
# near the top of the notebook; presumably fit returns one cluster label per
# document — confirm.
model = mgp.fit(text_lemmatized, n_terms)

In [None]:
def top_words(cluster_word_distribution, top_cluster, values):
    """Print the most frequent words of the selected clusters.

    Parameters
    ----------
    cluster_word_distribution : sequence of dict
        Indexed by cluster id; each dict maps a word to its count in that cluster.
    top_cluster : iterable of int
        Cluster ids to report, in the order they should be printed.
    values : int
        Number of top words to show per cluster.

    Returns
    -------
    None
        The result is printed, one "Cluster <id> : [(word, count), ...]" line per cluster.
    """
    for cluster in top_cluster:
        # Use the distribution passed in; previously the argument was ignored
        # and the global `mgp` was read instead.
        sort_dicts = sorted(
            cluster_word_distribution[cluster].items(),
            key=lambda k: k[1],
            reverse=True,
        )[:values]
        print("\nCluster %s : %s"%(cluster,sort_dicts))

In [None]:
# Number of documents assigned to each cluster.
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topic :')

for cluster_id, n_docs in enumerate(doc_count):
    print(cluster_id,"->",n_docs,end = ', ')

# Cluster ids for all topics, ordered from most to least populated.
top_index = doc_count.argsort()[::-1]
print('\nMost important clusters (by number of docs inside):', top_index)

# Keep only the populated clusters: the ids are sorted by descending size, so
# everything from the first empty cluster onwards can be dropped.
temp=[]
for cluster_id in top_index:
    if doc_count[cluster_id] == 0:
        break
    temp.append(cluster_id)
top_index=np.array(temp)
print('\nMost important clusters (by number of docs inside) without Zeroes:', top_index)

# show the top 10 words in term frequency for each populated cluster
print("show the top 10 words in term frequency for each cluster")
top_words(mgp.cluster_word_distribution, top_index, 10)