## Installation + Instructions

To install the required libraries, run the following on the command line:

```
pip install pandas nltk gensim pyLDAvis
```

You'll also need to install the stopwords corpus for nltk. This only needs to be done once. To start, import the nltk library and then call the `nltk.download()` method:

In [None]:
import nltk
nltk.download()   # No arguments: launches the interactive NLTK downloader (one-off setup step)

This will open a pop-up window. To download the stopwords corpus, select the 'Corpora' tab, scroll to 'stopwords', select it, and then click the 'Download' button.

## Libraries

In [1]:
import pandas as pd
import re
import nltk
import gensim
import pyLDAvis.gensim

  if 'order' in inspect.getargspec(np.copy)[0]:


## Options

In [2]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))       # Changing the cell widths

pd.options.display.max_rows = 30                                            # Setting the max number of rows
pd.options.display.max_columns = 50                                         # Setting the max number of columns

pyLDAvis.enable_notebook()                                                  # Render pyLDAvis output inline in the notebook

## Variable Definition

(Plug in your own values here)

In [3]:
# Input data: path + file name of the spreadsheet to read
indata = '/Data/Inputs/Responses.xlsx'

# Name of the worksheet to read from the input spreadsheet
insheet = 'Responses'

# Output data path
outdata = '/Data/Outputs/'

## Import & General Cleaning

In [4]:
# Load only the columns of interest from the configured worksheet.
# `sheet_name` / `usecols` are the current keyword names; the older
# `sheetname` / `parse_cols` spellings were deprecated in pandas 0.21 and
# are rejected by later releases.
df = pd.read_excel(indata,                        # Input spreadsheet location
                   sheet_name = insheet,          # Sheet to read
                   usecols = [2,23,24,25,26])     # Zero-based positions of the columns to parse


cols = df.columns                                 # Original column headings
new_cols = ['Q' + str(n)                          # Succinct headers: Q1, Q2, ... (one per column)
            for n in range(1, len(cols) + 1)]

df.columns = new_cols                             # Rename column headings
df = df.fillna('')                                # Replace NaN values with empty strings


## Text Cleaning & Pre-Processing

In [5]:
stops = set(nltk.corpus.stopwords.words("english")) # Creating a set of Stopwords
p_stemmer = nltk.stem.porter.PorterStemmer()        # Creating the stemmer model


def _clean_text(text):
    '''Turn one free-text response into a list of stemmed, stopword-free tokens.'''
    text = text.lower()                             # Converts to lower case
    text = re.sub("[^a-zA-Z]", " ", text)           # Replaces every non-letter (punctuation, digits, ...) with a space
    text = re.sub("cyclist", "cycl", text)          # Manual intervention for 'cyclist'
    words = [w for w in text.split() if w not in stops]   # Splits into words and removes stopwords
    return [p_stemmer.stem(w) for w in words]       # Stemming (reducing words to their root)


def cleaner(row, column=None):
    '''Function to clean the text data and prep for further analysis.

    row    -- a DataFrame row (or any mapping) containing the text to clean
    column -- key of the field to clean. When omitted, falls back to the
              notebook-level variable `col`, which is how this function
              originally picked its column; pass it explicitly to avoid
              depending on whichever loop last assigned `col`.

    Returns the cleaned text as a list of stemmed tokens.
    '''
    if column is None:
        column = col                                # Legacy fallback to the global loop variable
    return _clean_text(row[column])


for col in new_cols:
    df[col] = df[col].map(_clean_text)              # Clean each column element-wise

## Converting from a Dataframe to Lists

In [6]:
# One entry per question column: the key is the column name and the value is
# a plain list holding that column's contents, row by row.
output_dict = {name: list(df[name]) for name in new_cols}

## Creating a Topic Model


### Input Variables

In [7]:
# Input data: the list of token lists built for question Q1
model_in = output_dict['Q1']

# Number of topics to create
topics = 6

# Number of passes through the data (more passes = more accuracy)
passes = 20

### Modelling 

In [8]:
seed = 42                                                   # Fixed seed so repeated runs produce the same topics

mydict = gensim.corpora.Dictionary(model_in)                # Creates an id <-> term dictionary
corpus = [mydict.doc2bow(text) for text in model_in]        # convert tokenized documents into a document-term matrix
model = gensim.models.ldamodel.LdaModel(corpus,
                                        num_topics=topics,
                                        id2word=mydict,
                                        passes=passes,
                                        random_state=seed)  # Generate LDA model (seeded for reproducibility)
                                           

### Save + Visualise

In [10]:
vis = pyLDAvis.gensim.prepare(model, corpus, mydict)        # Visualise LDA Model
#pyLDAvis.save_html(data=vis,fileobj=outdata + 'name.html') # Optional: uncomment to save the html output under the `outdata` path
vis                                                         # Last expression in the cell, so the prepared visualisation is displayed