In [None]:
# Module for extracting keywords from Resofact survey data, and grouping them
# Changes from previous versions:
# - path for data is no longer hardcoded, it is computed based on the location of the current script
# - reading inputs from Excel file: the number of rows is no longer hard-coded
# - Extract keywords from each response rather than from the collection of all responses
# - Focus on single-word keywords, not phrases
# - Identify adjectives or nouns that modify those keywords, e.g. new products
# - Semantically cluster these attributes
# - If two keywords are nouns and appear one immediately after the other, we consider them a compound keyword
# - Extract adjectives that modify these compound keywords
# - After grouping the two keywords in the keyword compound, treat the keywords when they appear alone as any other keyword
# - Allow acronyms in the lemmatized response text used for creating compound keywords

# Load libraries:

from topia.termextract import extract  # installed with pypm install topia.termextract
# import operator
import nltk
import re
import os
import xlrd
# import xlwt
import csv
import time


# Function that maps Penn pos tags to WordNet pos tags:
def map_pos(pos_var):
    """Translate a Penn Treebank POS tag into a WordNet POS letter.

    Tags starting with N, J, V or R map to 'n', 'a', 'v' and 'r'
    respectively; every other tag yields the string 'other'.
    """
    prefix_to_wordnet = (("N", 'n'), ("J", 'a'), ("V", 'v'), ("R", 'r'))
    for penn_prefix, wordnet_letter in prefix_to_wordnet:
        if pos_var.startswith(penn_prefix):
            return wordnet_letter
    return 'other'


# Function that replaces a string with the sequence of lemmas of its tokens:

def lemmatize_text(string, wnl_var):
    """Tokenize *string* and return the list of lemmas of its tokens.

    Each token is POS-tagged with NLTK; when the tag maps to a WordNet
    category (see map_pos) the lemmatizer is called with that category,
    otherwise it is called with its default part of speech.

    string: the text to lemmatize.
    wnl_var: an object exposing lemmatize(token[, pos]), e.g. the
        module-level WordNet lemmatizer.
    """
    lemmas = []
    for word, penn_tag in nltk.pos_tag(nltk.word_tokenize(string)):
        wordnet_pos = map_pos(penn_tag)
        if wordnet_pos == 'other':
            # No WordNet category for this tag: use the lemmatizer's default.
            lemmas.append(wnl_var.lemmatize(word))
        else:
            lemmas.append(wnl_var.lemmatize(word, wordnet_pos))
    return lemmas


def clean_string(string):
    """Turn line breaks into sentence boundaries and space out punctuation.

    Newlines become full stops, doubled stops and stops directly after
    '?' or '!' are collapsed, and finally every '.', '?' and '!' is
    followed by a space. The substitutions are applied in the listed
    order, which matters because later steps rely on earlier ones.
    """
    ordered_substitutions = (
        ("\r\n", "\n"),
        ("\n\n", "\n"),
        ("\n", "."),
        ("..", "."),
        ("?.", "?"),
        ("!.", "!"),
        (".", ". "),
        ("?", "? "),
        ("!", "! "),
    )
    cleaned = string
    for old, new in ordered_substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

 
# ------------------------------------------------------------------------
# Instantiate a new term extractor:
# -----------------------------------------------------------------------
# Module-level topia term extractor; extract_keywords() calls it once per
# survey response.
extractor = extract.TermExtractor()
# Alternative filter configuration, currently disabled:
# extractor.filter = extract.DefaultFilter(singleStrengthMinOccur=2)
# Use topia's permissiveFilter instead of the extractor's default filter.
extractor.filter = extract.permissiveFilter

# ------------------------------------------------------------------------
# Instantiate a new Word Net lemmatizer:
# -----------------------------------------------------------------------
# Equivalent spelling through the nltk.stem sub-package, kept for reference:
# wnl = nltk.stem.WordNetLemmatizer()
# Module-level lemmatizer shared by lemmatize_text() callers and extract_keywords().
wnl = nltk.WordNetLemmatizer()

# -----------------------------------------------------------------------
# Data structure for key word adjectives
# ----------------------------------------------------------------------


class KwAdjective:
    """Record pairing one keyword with the adjectives collected for it."""

    def __init__(self):
        # Both attributes start empty; callers assign key_word and append
        # to adjectives after construction.
        self.adjectives = []
        self.key_word = None

    def return_adjectives(self):
        """Return the distinct adjectives collected so far as a set."""
        distinct = set()
        distinct.update(self.adjectives)
        return distinct

# -----------------------------------------------------------------------
# Function that returns the list of adjectives modifying a keyword:
# -----------------------------------------------------------------------

def find_adjectives_kw(kw_var, response_lemmatized, response_tagged, type_var):
    """Return the adjective directly preceding a keyword, or "" if there is none.

    kw_var: the keyword; one word for type "single", two space-separated
        words for type "compound".
    response_lemmatized: list of lemmas of the response, searched for the
        keyword.
    response_tagged: list of (token, POS tag) pairs aligned with
        response_lemmatized; the token with tag 'JJ' is what gets returned.
    type_var: "single" or "compound"; any other value yields "".

    For a compound, the keyword position is the first place where its two
    words occur next to each other, so an earlier stand-alone occurrence
    of the first word cannot contribute a wrong adjective.
    """
    kw_index = None

    if type_var == "single":
        if kw_var in response_lemmatized:
            kw_index = response_lemmatized.index(kw_var)
    elif type_var == "compound":
        parts = kw_var.split(" ")
        # A malformed compound (fewer than two words) simply has no match.
        if len(parts) >= 2:
            first_word, second_word = parts[0], parts[1]
            for position in range(len(response_lemmatized) - 1):
                if (response_lemmatized[position] == first_word
                        and response_lemmatized[position + 1] == second_word):
                    kw_index = position
                    break

    # Not found, or found at the very start where nothing can precede it.
    if kw_index is None or kw_index == 0:
        return ""

    previous_index = kw_index - 1
    if previous_index < len(response_tagged) and response_tagged[previous_index][1] == 'JJ':
        return response_tagged[previous_index][0]
    return ""

# -----------------------------------------------------------------------------------
# Functions that return the list of adjectives of the form keyword + BE + adjective:
# -----------------------------------------------------------------------------------

def find_adjectives_be_kw(kw_var, response_lemmatized, response_tagged, type_var):
    """Return the adjective in a "keyword + BE + adjective" pattern, or "".

    kw_var: the keyword; one word for type "single", two space-separated
        words for type "compound".
    response_lemmatized: list of lemmas of the response; the element right
        after the keyword must be the lemma 'be'.
    response_tagged: list of (token, POS tag) pairs aligned with
        response_lemmatized; the element after 'be' must carry tag 'JJ'
        and its token is returned.
    type_var: "single" or "compound"; any other value yields "".

    A compound is located as two adjacent lemmas. Testing the whole
    "word1 word2" string for membership in the token list can never
    succeed, which is why compounds are matched word by word here.
    """
    kw_index = None
    kw_length = 1

    if type_var == "single":
        if kw_var in response_lemmatized:
            kw_index = response_lemmatized.index(kw_var)
    elif type_var == "compound":
        parts = kw_var.split(" ")
        if len(parts) >= 2:
            first_word, second_word = parts[0], parts[1]
            for position in range(len(response_lemmatized) - 1):
                if (response_lemmatized[position] == first_word
                        and response_lemmatized[position + 1] == second_word):
                    kw_index = position
                    kw_length = 2
                    break

    if kw_index is None:
        return ""

    # Positions of the expected 'be' lemma and of the adjective after it.
    be_index = kw_index + kw_length
    adjective_index = be_index + 1
    if adjective_index >= len(response_tagged) or be_index >= len(response_lemmatized):
        return ""

    if response_tagged[adjective_index][1] == 'JJ' and response_lemmatized[be_index] == 'be':
        return response_tagged[adjective_index][0]
    return ""


def _normalize_punctuation(text):
    """Tidy spacing around full stops and collapse punctuation runs into one stop."""
    text = text.replace("  ", " ").replace(" . ", ". ").replace("..", ".").replace(" . ", ".")
    text = text.replace(".", ". ").replace("  ", " ").replace(" . ", ". "). \
        replace("..", ".").replace(' . ', ".").replace("/", " ")
    text = re.sub(r'[,\.;:\?!] *?\. *?', '. ', text)
    return text.replace("..", ".")


def _record_adjectives(kw, adjectives, kw_adj_response, kw2adj2freq, kw_adjectives):
    """Register every usable adjective found for *kw* in the shared tallies."""
    for adjective in adjectives:
        # Skip "not found" results and adjectives without any letter.
        if not adjective or not re.search('[a-zA-Z]', adjective):
            continue
        kw_adj_response[kw] = adjective.lower().rstrip()
        kw2adj2freq[(kw, adjective)] = kw2adj2freq.get((kw, adjective), 0) + 1
        if kw not in kw_adjectives:
            kw_a_object = KwAdjective()
            kw_a_object.key_word = kw
            kw_adjectives[kw] = kw_a_object
        kw_adjectives[kw].adjectives.append(adjective)


def extract_keywords(test, target_word, input_directory, output_directory, output_file_keywords_name, output_file_freq_name, column_number, acronyms, exclude, exclude_keywords):
    """Extract keywords and their adjectives from survey responses in a spreadsheet.

    Reads the sheet named *target_word* from '<target_word>.xlsx' in
    *input_directory*, starting at row index 2, and writes two CSV files
    into *output_directory*:

    * *output_file_keywords_name*: one row per (response, keyword) with the
      adjective recorded for that keyword in that response;
    * *output_file_freq_name*: one row per (keyword, adjective) with the
      number of times each was counted.

    test: "yes" limits reading to the first 10 sheet rows (or fewer if the
        sheet is shorter); any other value reads all rows.
    target_word: name of both the input workbook (without extension) and
        the sheet inside it.
    input_directory / output_directory: directories for input and output.
    column_number: index of the column holding the responses. When it is
        0 the response id is a running counter, otherwise the id is read
        from column 0.
    acronyms: strings to strip from the text given to the term extractor
        and to add as keywords when found in the response.
    exclude: regex fragments removed from each response.
    exclude_keywords: single words never reported as keywords.

    Raises AssertionError when the input workbook does not exist.
    """
    print ("\tInitializing...")

    input_file = target_word + '.xlsx'

    # Tallies shared across all responses:
    kw_list = list()
    kw2freq = dict()
    kw2adj2freq = dict()
    kw_adjectives = dict()
    kw_adj_response = dict()

    # Open the input first so that a missing workbook does not leave an
    # empty output file behind.
    sheet_name = target_word
    row_id = 0
    input_path = os.path.join(input_directory, input_file)
    assert os.path.exists(input_path), "I did not find the file "+str(input_file) + " in " + str(input_directory)
    sheet = xlrd.open_workbook(input_path).sheet_by_name(sheet_name)

    print ("\tReading input...")
    if test == "yes":
        # Never read past the end of a short sheet in test mode.
        n = min(10, sheet.nrows)
    else:
        n = sheet.nrows

    with open(os.path.join(output_directory, output_file_keywords_name), 'w') as outfile:
        output = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        output.writerow(["Id", "Response", "Keyword", "Adjectives"])

        for row_idx in range(2, n):
            if column_number == 0:
                row_id += 1
            else:
                row_id = sheet.cell(row_idx, 0).value

            print( "\t\tReading response number " + str(row_id))
            response = sheet.cell(row_idx, column_number).value

            # if a response has a newline, replace it with a full stop so that the keyword extractor makes sense:
            if '\n' in response:
                response = clean_string(response)
            # add a full stop at the end of each response, otherwise the keyword extractor concatenates the responses:
            if not response.endswith((".", "?", "!")):
                response += " ."
            # if a response has a list, replace the list bullets with a full stop:
            response = re.sub(r'\d *?\)', '. ', response)
            response = response.replace(" - ", ". ")
            response_lower = response.lower()

            # replace -ing forms at the start of response with corresp. lemma. E.g. meeting -> meet
            response_tokens = nltk.word_tokenize(response_lower)
            if response_tokens and response_tokens[0].endswith('ing'):
                response_tokens[0] = wnl.lemmatize(response_tokens[0], pos='v')
                response_lower = ' '.join(response_tokens)

            # remove acronyms from the responses:
            # NOTE(review): the pattern uses the acronym as given but is matched
            # against lower-cased text, so upper-case acronyms never match here
            # or in the "add acronyms" step below - confirm the intended casing.
            response_no_acronyms = response_lower
            for acronym in acronyms:
                my_regex = r"^(.+?)\b" + acronym + r"\b(.+?)$"
                if re.search(my_regex, response_lower):
                    response_no_acronyms = re.sub(my_regex, r'\1 \2', response_no_acronyms)

            response_no_acronyms = _normalize_punctuation(response_no_acronyms)
            response_yes_acronyms = _normalize_punctuation(response)

            # Strip excluded expressions from both variants; each variant is
            # rewritten from its own text.
            for w in exclude:
                my_regex = r"^(.*?)\b" + w + r"\b(.*?)$"
                response_no_acronyms = re.sub(my_regex, r'\1 \2', response_no_acronyms)
                response_yes_acronyms = re.sub(my_regex, r'\1 \2', response_yes_acronyms)

            # ------------------------------------------------------------------------
            # Extract keywords:
            # -----------------------------------------------------------------------
            formatted_no_acronyms = re.sub(r'\bi\b', 'I', response_no_acronyms.lower())
            my_extractor = extractor(formatted_no_acronyms)

            # Adjectives and compounds are looked up in the variant that still
            # contains the acronyms:
            formatted_yes_acronyms = re.sub(r'\bi\b', 'I', response_yes_acronyms)
            my_response_tokenized_yes_acronyms = nltk.word_tokenize(formatted_yes_acronyms)
            my_response_tagged_yes_acronyms = nltk.pos_tag(my_response_tokenized_yes_acronyms)
            my_response_lemmatized_yes_acronyms = lemmatize_text(formatted_yes_acronyms, wnl)

            # ----------------------------------------------------------------------
            # Select keywords:
            # ----------------------------------------------------------------------
            kws_response = list()
            for item in my_extractor:
                kw = item[0]
                print ("\t\t\tkw:" + kw)
                # exclude keywords that do not contain any alphabetical characters;
                # only keep one-word keywords and exclude certain words from the list of keywords:
                if re.search('[a-zA-Z]', kw) and len(kw.split(" ")) < 2 and kw not in exclude_keywords:
                    kw_list.append(kw)
                    kws_response.append(kw)
                    kw2freq[kw] = kw2freq.get(kw, 0) + 1

                    # find adjectives associated with this keyword:
                    kw_adj_response[kw] = ""
                    my_adjective = find_adjectives_kw(kw, my_response_lemmatized_yes_acronyms, my_response_tagged_yes_acronyms, "single")
                    my_adjective_be = find_adjectives_be_kw(kw, my_response_lemmatized_yes_acronyms, my_response_tagged_yes_acronyms, "single")
                    _record_adjectives(kw, [my_adjective, my_adjective_be], kw_adj_response, kw2adj2freq, kw_adjectives)

            # -----------------------------------------------------
            # Add acronyms to the list of keywords:
            # -----------------------------------------------------
            for acronym in acronyms:
                my_regex = r".+?\b" + acronym + r"\b.+?"
                if re.search(my_regex, response_lower):
                    kw_list.append(acronym)
                    kws_response.append(acronym)
                    kw2freq[acronym] = kw2freq.get(acronym, 0) + 1

            # If two keywords are nouns and appear one immediately after the other, we consider them a compound keyword;
            # e.g. "brand name", "track record".
            # Iterate over a snapshot of the single-word keywords (the list is
            # extended below) and count each compound once per response: every
            # adjacent pair is visited in both orders.
            single_kws = [k for k in kws_response if len(k.split(" ")) == 1]
            compounds_seen = set()
            for kw1 in single_kws:
                for kw2 in single_kws:
                    if kw1 == kw2:
                        continue
                    try:
                        index_kw1 = my_response_lemmatized_yes_acronyms.index(kw1)
                        index_kw2 = my_response_lemmatized_yes_acronyms.index(kw2)
                    except ValueError:
                        # One of the keywords is absent from the lemmatized text.
                        continue

                    # NOTE(review): "> 1" means a compound at the very start of
                    # the response (positions 0 and 1) is not detected - confirm.
                    if index_kw2 > 1 and index_kw1 == index_kw2 - 1:
                        compound_kw = kw1 + " " + kw2
                    elif index_kw1 > 1 and index_kw2 == index_kw1 - 1:
                        compound_kw = kw2 + " " + kw1
                    else:
                        continue

                    if compound_kw in compounds_seen:
                        continue
                    compounds_seen.add(compound_kw)

                    kws_response.append(compound_kw)
                    kw_list.append(compound_kw)
                    kw2freq[compound_kw] = kw2freq.get(compound_kw, 0) + 1

                    # add adjectives of compound keyword (best effort):
                    kw_adj_response[compound_kw] = ""
                    try:
                        my_adjective_c = find_adjectives_kw(compound_kw, my_response_lemmatized_yes_acronyms, my_response_tagged_yes_acronyms, "compound")
                    except (IndexError, ValueError):
                        my_adjective_c = ""
                    try:
                        my_adjective_c_be = find_adjectives_be_kw(compound_kw, my_response_lemmatized_yes_acronyms, my_response_tagged_yes_acronyms, "compound")
                    except (IndexError, ValueError):
                        my_adjective_c_be = ""
                    _record_adjectives(compound_kw, [my_adjective_c, my_adjective_c_be], kw_adj_response, kw2adj2freq, kw_adjectives)

            # --------------------------------------------------
            # Print out keywords:
            # --------------------------------------------------
            for kw in set(kws_response):
                output.writerow([row_id, response, kw, kw_adj_response.get(kw, "")])

    # ---------------------------------------------
    # Open output file with keywords' frequency:
    # ---------------------------------------------

    print ("\tWriting output...")
    with open(os.path.join(output_directory, output_file_freq_name), 'w') as outfile_freq:
        output_freq = csv.writer(outfile_freq, delimiter=',', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        output_freq.writerow(
            ["Keyword", "Number of responses of keyword", "Adjective", "Number of responses of keyword and adjective"])
        kw_list = sorted(set(kw_list))

        for count, kw in enumerate(kw_list, 1):
            print ("\t\t" + str(count) + " out of " + str(len(kw_list)) + ": Printing frequencies: " + kw)
            if kw in kw_adjectives:
                # Sorted so that the row order is reproducible between runs.
                my_adj_list = sorted(kw_adjectives[kw].return_adjectives())
            else:
                my_adj_list = list()

            if my_adj_list:
                for adj in my_adj_list:
                    output_freq.writerow([kw, str(kw2freq[kw]), adj, str(kw2adj2freq[(kw, adj)])])
            else:
                output_freq.writerow([kw, str(kw2freq[kw]), "", ""])


In [None]:
# Pipeline for data processing and analysis for Resofact
# Version: 1.2
# Date: 27/11/2016
# Authors: Gard Jenset and Barbara McGillivray

# ----------------------------------
# Import modules:
# ----------------------------------

# For keyword extraction:

from __future__ import division
from topia.termextract import extract  # installed with pypm install topia.termextract
import nltk
import re
import os
import xlrd
import csv
import time

# For semantic clustering:

import math
import numpy as np
import sys
#import os
#import csv
#import time
import semantic_similarity_functions # our module

# For word clouds:

#import os
#import csv
from pytagcloud import create_tag_image, make_tags
from pytagcloud.lang.counter import get_tag_counts

# For bar plots:
import matplotlib.pyplot as plt; plt.rcdefaults()
#import numpy as np
#import os
#import csv

# ---------------------------------------
# Command-line interface:
# ---------------------------------------

# import argparse

# # instantiate a new argument parser
# parser = argparse.ArgumentParser()

# # these are the options for the interface:
# parser.add_argument('--dir', '-d', required=True, help='Full path to directory containing DT HTML output files.')
# parser.add_argument('--weekly', '-w', required=False, action="store_true",
                    # help='''Time period to do report for: Including it will accumulate results
                    # within the same week. Excluding it will accumulate results within the same day.''')
# parser.add_argument('--clean', '-c', required=False,
                    # help='Optional string/regex to clean from start of DT html filename for readability. E.g.: HP_DT_.')

# # parse what's been entered...
# args = parser.parse_args()

# # can now do...
# args.dir
# args.weekly
# args.clean


# Parameters:
# Every setting is asked for interactively; an empty answer selects the
# default named in the prompt.

test = input("Is this a test? Reply yes or no. Leave empty for yes.")
target_word = input("What is the target word for the survey? Leave empty for pride")  # 'embarassment'#'success'

if target_word == "":
    target_word = "pride"

input_directory = input(
    "What is the path to the input directory where the spread sheet with the responses is? "
    "Leave empty for default (company_name\\company_name_pride\\). "
    "The input spread sheet should have the same name as the target word and the responses "
    "should be in the sheet whose name is the target word.")
output_directory = input(
    "What is the path to the output directory where the output files should be saved? "
    "Leave empty for default (company_name\\company_name_pride\\).")
acronyms_file = input(
    "What is the name of the file containing the acronyms to be excluded from the list of keywords? "
    "Leave empty for default (hard_coded here or create acronyms_list_default.txt). "
    "Note that this file should be in the input folder.")
min_similarity_score = input("What is the minimum threshold for the semantic similarity score? Leave empty for 0.8.")
column_number = input("What is the number of the column containing the survey responses in the input spread sheet? Leave empty for 0.")  # 12
n_words = input("What is the maximum number of words to include in the plots? Leave empty for 10.")
min_freq = input("What is the frequency threshold above which you want to view keywords in the network graph? Leave empty for 3.")

# Default parameters:

if test == "":
    test = "yes"

# The threshold is numeric whether it was typed in or defaulted.
if min_similarity_score == "":
    min_similarity_score = 0.8
else:
    min_similarity_score = float(min_similarity_score)

if column_number == "":
    column_number = 0
else:
    column_number = int(column_number)

if n_words == "":
    n_words = 10
else:
    n_words = int(n_words)

if acronyms_file == "":
    acronyms_file = "acronyms_list_default.txt"

# NOTE(review): `path` is computed but not used for the default directories,
# which are therefore relative to the working directory - confirm intent.
if input_directory == "":
    path = os.getcwd().replace("code", "data")
    input_directory = 'company_name\\company_name_pride\\'

if output_directory == "":
    path = os.getcwd().replace("code", "data")
    output_directory = 'company_name\\company_name_pride\\'

assert os.path.exists(input_directory), "I did not find the input directory "+str(input_directory)
assert os.path.exists(output_directory), "I did not find the output directory "+str(output_directory)

path_plots = os.path.join(output_directory, "plots")
if not os.path.exists(path_plots):
    os.makedirs(path_plots)

if min_freq == "":
    min_freq = 3
else:
    min_freq = int(min_freq)

# ---------------------------------------
# File and directory names
# ---------------------------------------
# Built after the defaults are applied so that the graph file name carries
# the effective frequency threshold rather than an empty answer.

date_stamp = time.strftime("%d%m%Y")
output_file_freq_name = 'Keywords_frequency_' + target_word + "_" + date_stamp + ".csv"
output_word_cloud = 'word_cloud_' + target_word + "_" + date_stamp + ".png"
output_bar_plot = 'bar_plot_' + target_word + "_" + date_stamp + ".png"
output_graph = 'graph_' + target_word + "_" + date_stamp + "_freqthreshold" + str(min_freq) + ".png"
output_file_cluster = 'Keywords_frequency_' + target_word + "_" + date_stamp + "_clustered.csv"
output_file_keywords_name = 'Responses_keywords_' + target_word + "_" + date_stamp + ".csv"
output_file_sim_name = 'Keywords_frequency_' + target_word + "_" + date_stamp + "_similarities.csv"

# Test files:
# Mark every output of a test run, plots included, so that it cannot
# overwrite the result of a full run made on the same day.

if test == "yes":
    output_file_freq_name = output_file_freq_name.replace(".csv", "_test.csv")
    output_word_cloud = output_word_cloud.replace(".png", "_test.png")
    output_bar_plot = output_bar_plot.replace(".png", "_test.png")
    output_graph = output_graph.replace(".png", "_test.png")
    output_file_cluster = output_file_cluster.replace(".csv", "_test.csv")
    output_file_keywords_name = output_file_keywords_name.replace(".csv", "_test.csv")
    output_file_sim_name = output_file_sim_name.replace(".csv", "_test.csv")
    
# ----------------------------------------------------------
# Exclusion lists
# ----------------------------------------------------------

# Expressions removed from every response, for example "e.g."
exclude = ['e. g. ', 'e.g.']

# Words that are never reported as keywords. Company names are excluded
# as well for the surveys listed below.
exclude_keywords = ['i']
if target_word in ("pride", "embarassment"):
    exclude_keywords.append('replace_with_company_name')

# List of acronyms that should not be lemmatized.
# For many acronyms, create a file in input_directory and load it instead:
#assert os.path.exists(os.path.join(input_directory, acronyms_file)), "I did not find the file for acronyms "+str(acronyms_file) + " in " + str(input_directory)
#with open(os.path.join(input_directory, acronyms_file)) as acronyms_f:
#    acronyms = acronyms_f.read().splitlines()
acronyms = ['SA', 'HR', 'ICT', 'IT']


# Each pipeline stage below asks the same question. A stage runs only when
# the answer is neither "yes" nor empty; an empty answer skips it.
SKIP_PROMPT = "Do you want to skip this step? Leave empty if you want this."
SKIP_ANSWERS = ("yes", "")

# ----------------------------------------------------------
# Extract keywords
# ----------------------------------------------------------

#from resofact_topic_extraction_27112016 import extract_keywords

# Extract keywords and write them to a file:

print ("Extracting keywords...")
skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    extract_keywords(
        test, target_word, input_directory, output_directory,
        output_file_keywords_name, output_file_freq_name,
        column_number, acronyms, exclude, exclude_keywords)


# -----------------------------------
# Calculate semantic similarity:
# -----------------------------------

from semantic_similarity_27112016 import calculate_keyword_semantic_similarity

print( "Calculating semantic similarity between keywords...")
skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    calculate_keyword_semantic_similarity(
        test, target_word, output_directory, output_file_freq_name,
        output_file_sim_name, min_similarity_score)

# ----------------------------------
# Cluster keywords:
# ----------------------------------

from clustering_27112016 import cluster_keywords

print( "Clustering keywords...")
skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    cluster_keywords(
        test, target_word, min_similarity_score, output_directory,
        output_file_freq_name, output_file_sim_name, output_file_cluster)

# ---------------------------------
# Create word clouds:
# ---------------------------------

print( "Creating word clouds...")

from create_word_cloud_23102016 import create_word_cloud

skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    # One cloud for the individual keywords and one for the clusters
    # (clusters include singletons), each with their frequencies:
    keywords_cloud_name = output_word_cloud.replace(".png", "_keywords.png")
    clusters_cloud_name = output_word_cloud.replace(".png", "_clusters.png")
    create_word_cloud(output_directory, output_file_freq_name, path_plots, keywords_cloud_name, "keywords")
    create_word_cloud(output_directory, output_file_cluster, path_plots, clusters_cloud_name, "clusters")


# --------------------------------
# Create bar plots:
# --------------------------------

print ("Creating bar plots...")

from create_bar_plot_23102016 import create_bar_plot

skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    keywords_bar_name = output_bar_plot.replace(".png", "_keywords.png")
    clusters_bar_name = output_bar_plot.replace(".png", "_clusters.png")
    create_bar_plot(output_directory, output_file_freq_name, path_plots, keywords_bar_name, n_words, "keywords")
    create_bar_plot(output_directory, output_file_cluster, path_plots, clusters_bar_name, n_words, "clusters")


# --------------------------------
# Create graphs:
# --------------------------------

print ("Creating graphs...")

from create_graph_27112016 import create_graph

skip = input(SKIP_PROMPT)
if skip not in SKIP_ANSWERS:
    keywords_graph_name = output_graph.replace(".png", "_keywords.png")
    create_graph(output_directory, output_file_keywords_name, path_plots, keywords_graph_name, min_freq)
    
