In [None]:
## Importing the relevant packages:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import math
from statistics import mean
import random
from gensim.parsing.preprocessing import remove_stopwords
import re

In [None]:
from top2vec import Top2Vec

In [None]:
# Load the survey responses and report how many rows were read.
df = pd.read_csv("consumer_survey_data.csv")
print(len(df))

# Preview the first rows.
df.head()

In [None]:
# Peek at the free-text answers that are cleaned and topic-modelled below.
df['outcome_1_why']

In [None]:
# Brand, product and category words to strip from every answer before topic
# modelling (presumably because they occur in most answers regardless of topic).
_EXCLUDED_WORDS = (
    'Subway', 'subway', "subway's", "subways", "Subway's",
    'King', 'Burger', 'burgerking', 'BurgerKing', 'burger', 'king', 'burgerking', "king's", "KING", 'BK',
    'Grubhub', 'grubhub', 'GrubHub',
    'doordash', 'doorDash', 'DoorDash', 'door', 'dash', 'Doordash',
    'uber', 'Uber', 'lyft', 'Lyft',
    'giftcard', 'card', 'gift',
    'food', 'restaurant', 'restaurants', 'eat', 'fastfood', 'fast',
    'eating', 'meal', 'meals', "FOOD", "foods",
    'whopper', 'whoppers', 'Whopper', 'fries', 'coffee', 'breakfast', 'onion', 'ring', 'steak', "whooper",
    'hamburger', 'hamburgers', 'cheeseburger', 'cheeseburgers', 'burgers',
    'burgers', 'subs', 'sandwich', 'sandwiches', 'sub', "SUB", "subwa",
    'ride', 'delivery', 'car', 'transportation', "company", "companies", "brands", "brand", "services",
)


def _normalize_token(token):
    """Lower-case a token, trim it, and strip apostrophes and other punctuation."""
    token = token.lower().strip()
    # Drop apostrophes sitting between two letters ("subway's" -> "subways").
    token = re.sub('(?<=[a-z])\'(?=[a-z])', '', token)
    # Drop whatever punctuation is left (trailing commas, periods, quotes, ...).
    return re.sub(r'[^\w\s]', '', token)


# Normalised once at definition time, so both sides of the membership test go
# through exactly the same transformation.
_EXCLUDED_TOKENS = frozenset(_normalize_token(word) for word in _EXCLUDED_WORDS)


def get_preprocessed(col_text):
    """Remove stopwords and brand/product vocabulary from one survey answer.

    Each whitespace-separated word is compared in normalised form (lower-cased,
    punctuation removed) against the exclusion list, so the match is
    case-insensitive and ignores attached punctuation. Words that survive are
    returned in their original spelling.

    Parameters
    ----------
    col_text : str
        Raw free-text answer. Any non-string value (for example the NaN pandas
        uses for a missing answer) is treated as an empty answer.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``''`` if nothing remains.
    """
    if not isinstance(col_text, str):
        return ''
    filtered_sentence = remove_stopwords(col_text)
    resultwords = [
        word
        for word in filtered_sentence.split()
        if _normalize_token(word) not in _EXCLUDED_TOKENS
    ]
    return ' '.join(resultwords)

# Clean every free-text answer; the cleaned text is what the topic model is trained on.
df['clean_text'] = df['outcome_1_why'].apply(get_preprocessed)

In [None]:
# Seed both the stdlib and the NumPy global generators before training.
# NOTE(review): Top2Vec's dimensionality-reduction step presumably draws from
# NumPy's global state rather than from `random`, so seeding `random` alone has
# no effect on it. Exact run-to-run reproducibility may additionally require a
# fixed UMAP random_state — confirm against the installed top2vec version.
random.seed(12345)
np.random.seed(12345)

# Train the topic model on the cleaned answers.
new_model = Top2Vec(df['clean_text'].values, embedding_model = 'universal-sentence-encoder')
print(new_model.get_num_topics())

# Keywords of every discovered topic.
new_model.topic_words

In [None]:
#### Reducing the number of topics:

# The topic mapping that the function returns is a nested list that explains which
# topics have been merged together to form the 10 larger topics.
topic_mapping = new_model.hierarchical_topic_reduction(num_topics = 10)

# Looking at the original topics within topic 1:
print(topic_mapping[1])

# Accessing the new topic keywords with the topic_words_reduced attribute.
# Printed explicitly: a cell only auto-displays its final expression, so a bare
# expression in the middle of the cell would be evaluated and thrown away.
print(new_model.topic_words_reduced[1])

new_model.get_topic_hierarchy()

In [None]:
# Searching for documents by topic to investigate topic areas:
# (requests 20 documents for topic 0)
new_model.search_documents_by_topic(0, num_docs = 20)

In [None]:
def _build_doc_topic_lookup():
    """Return a dict mapping each document id to the topic that lists it.

    Runs one ``search_documents_by_topic`` call per topic, asking for as many
    documents as the topic's reported size.
    """
    topic_sizes = new_model.get_topic_sizes()[0]
    lookup = {}
    for num_topic in range(new_model.get_num_topics()):
        doc_ids = new_model.search_documents_by_topic(num_topic, num_docs = topic_sizes[num_topic])[2]
        for doc_id in doc_ids:
            # setdefault keeps the lowest topic number if an id is listed twice,
            # the same answer a first-match scan over the topics would give.
            lookup.setdefault(doc_id, num_topic)
    return lookup


def get_topic_number(index, lookup=None):
    """Return the topic number of the document with id ``index``.

    Parameters
    ----------
    index
        Document id as returned by the model's document searches.
    lookup : dict, optional
        Pre-built id -> topic mapping. Pass one when calling this for many
        documents so the model is searched once per topic instead of once per
        topic for every document. Built on demand when omitted.

    Returns
    -------
    The topic number, or ``None`` when the id is not listed under any topic.
    """
    if lookup is None:
        lookup = _build_doc_topic_lookup()
    return lookup.get(index)

# Build the mapping once and reuse it for every row.
doc_topic_lookup = _build_doc_topic_lookup()
df['topic_number'] = df['index'].apply(get_topic_number, args = (doc_topic_lookup,))

In [None]:
# Print the topic hierarchy. NOTE(review): the hard-coded topic groups used for the
# reduced topic numbers below were presumably copied from this output — confirm.
print(new_model.get_topic_hierarchy())

def get_overall_topic_number(topic_number):
    """Translate an original topic number into its reduced topic number (1-10).

    The table pairs each group of original topic numbers with the reduced topic
    it belongs to. Groups are checked in order and the first one containing
    ``topic_number`` decides the result; a value found in no group gives ``None``.

    NOTE(review): the groups are hard-coded, presumably from one printed topic
    hierarchy — confirm they still match whenever the model is retrained.
    """
    reduced_groups = (
        ((0,), 1),
        ((7, 22, 15), 2),
        ((20, 54, 16), 3),
        ((18, 24, 8), 4),
        ((5, 4, 25, 12), 5),
        ((52, 26, 23, 55, 35, 46, 14), 6),
        ((36, 56, 51, 34, 17, 44, 32, 43, 49, 57, 6), 7),
        ((27, 39, 19, 2), 8),
        ((33, 42, 11, 3, 13, 53, 1), 9),
        ((10, 58, 41, 29, 21, 40, 47, 38, 48, 30, 31, 28, 9, 37, 45, 50), 10),
    )
    for members, reduced_number in reduced_groups:
        if topic_number in members:
            return reduced_number
    return None

# Reduced topic number (1-10) for every row, derived from its original topic number.
df['topic_number_reduced'] = df['topic_number'].apply(get_overall_topic_number)

In [None]:
def get_overall_topic_name(topic_number):
    """Translate an original topic number into the label of its reduced topic.

    The table pairs each label with the group of original topic numbers it
    covers. Groups are checked in order and the first one containing
    ``topic_number`` decides the result; a value found in no group gives ``None``.

    NOTE(review): the groups are hard-coded, presumably from one printed topic
    hierarchy — confirm they still match whenever the model is retrained.
    """
    labelled_groups = (
        ("Misinformation", (0,)),
        ("Local availability", (7, 22, 15)),
        ("Food", (20, 54, 16)),
        ("Frequent use", (18, 24, 8)),
        ("Love", (5, 4, 25, 12)),
        ("Likely to use", (52, 26, 23, 55, 35, 46, 14)),
        ("Preference", (36, 56, 51, 34, 17, 44, 32, 43, 49, 57, 6)),
        ("Like", (27, 39, 19, 2)),
        ("Use service", (33, 42, 11, 3, 13, 53, 1)),
        ("Product features", (10, 58, 41, 29, 21, 40, 47, 38, 48, 30, 31, 28, 9, 37, 45, 50)),
    )
    for label, members in labelled_groups:
        if topic_number in members:
            return label
    return None

# Reduced-topic label for every row, looked up from its original topic number.
df['topic_name'] = df['topic_number'].apply(get_overall_topic_name)

In [None]:
# Size of each reduced topic, by number and then by name. The first table is
# printed explicitly because a cell only auto-displays its final expression.
print(df['topic_number_reduced'].value_counts())
df['topic_name'].value_counts()

In [None]:
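# Uncomment to save the topic assignments; a later cell reads "text_outcome_1_analyzed.csv" back in.
#df.to_csv("text_outcome_1_analyzed.csv")

## Adding topics to the full data: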

In [None]:
# Reload the full survey file so the saved topic names can be attached to it.
data_df = pd.read_csv("consumer_survey_data.csv")
print(len(data_df))

In [None]:
# Read back the saved topic assignments for outcome 1 and attach each topic
# name to the full survey table by matching doc_id against ResponseId.
outcome_1_df = pd.read_csv("text_outcome_1_analyzed.csv")
print(len(outcome_1_df))

# If this dict is shorter than the frame, some doc_id values repeat
# (the last occurrence wins).
outcome_1_dict = {
    doc_id: topic_name
    for doc_id, topic_name in zip(outcome_1_df['doc_id'], outcome_1_df['topic_name'])
}
print(len(outcome_1_dict))

# ResponseIds without an entry in the dict end up as missing values.
data_df['outcome_1_topic'] = data_df['ResponseId'].map(outcome_1_dict)

In [None]:
# Read back the saved topic assignments for outcome 2 and attach each topic
# name to the full survey table by matching doc_id against ResponseId.
outcome_2_df = pd.read_csv("text_outcome_2_analyzed.csv")
print(len(outcome_2_df))

# If this dict is shorter than the frame, some doc_id values repeat
# (the last occurrence wins).
outcome_2_dict = {
    doc_id: topic_name
    for doc_id, topic_name in zip(outcome_2_df['doc_id'], outcome_2_df['topic_name'])
}
print(len(outcome_2_dict))

# ResponseIds without an entry in the dict end up as missing values.
data_df['outcome_2_topic'] = data_df['ResponseId'].map(outcome_2_dict)

In [None]:
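# Uncomment to save the result. NOTE: this path is the same file the survey data was loaded from, so saving would overwrite it.
#data_df.to_csv("consumer_survey_data.csv")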