## FCAN Code
#### Will Russell 
November 2017
<hr />
<div>
    <p> This is the text miner code for the article found in FormingClustersAsNeeded.ipynb </p>
</div>

In [51]:
from Project4_code import Porter_Stemmer_Python as ps
import re
import pandas as pd

In [52]:
def retrieve_text(relativePath="./", fileName="sentences.txt"):
    """Read a text file and return its non-blank lines, stripped of whitespace.

    Args:
        relativePath: Directory that contains the file. It is joined with
            ``fileName``; an absolute ``fileName`` takes precedence over it.
        fileName: Name (or path) of the file to read.

    Returns:
        A list of strings, one per non-empty line, with leading and trailing
        whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os

    # Previously relativePath was accepted but silently ignored.
    full_path = os.path.join(relativePath, fileName)
    with open(full_path, newline="\n") as f:
        stripped_lines = (line.strip() for line in f)
        # Drop lines that are empty once surrounding whitespace is removed.
        return [line for line in stripped_lines if len(line) > 0]

def convert_to_lower(sentences):
    """Return a new list holding the lowercase form of every sentence string."""
    return [text.lower() for text in sentences]

def strip_punctuation(word_token_list):
    """Remove non-word characters and underscores from each token.

    Args:
        word_token_list: Iterable of string tokens.

    Returns:
        A new list of cleaned tokens; tokens that become empty after cleaning
        are dropped.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. Compiled once rather than per token.
    non_word = re.compile(r'[\W_]+')
    cleaned_tokens = (non_word.sub("", word) for word in word_token_list)
    return [token for token in cleaned_tokens if len(token) > 0]

def strip_text(sentences):
    """Apply strip_punctuation to every tokenized sentence.

    Returns a new list with one cleaned token list per input sentence.
    """
    return [strip_punctuation(token_list) for token_list in sentences]

def strip_numbers(sentences):
    """Return the sentences with every digit character removed.

    Only the digits are deleted; surrounding whitespace is left untouched.
    """
    digit_re = re.compile(r'\d')
    return [digit_re.sub("", text) for text in sentences]

def split_text(text):
    """Split each string in ``text`` on whitespace.

    Returns a list of token lists, one per input string.
    """
    return [line.split() for line in text]

def remove_stopwords(sentences, stop_words):
    """Filter stop words out of every tokenized sentence.

    Args:
        sentences: Iterable of token lists.
        stop_words: Container of tokens to discard (tested with ``in``).

    Returns:
        A new list of token lists with the stop words removed; token order is
        preserved.
    """
    filtered = []
    for token_list in sentences:
        kept = []
        for token in token_list:
            if token in stop_words:
                continue
            kept.append(token)
        filtered.append(kept)
    return filtered

def stem_words(sentence_tokens):
    """Stem every token of one sentence with the project's Porter stemmer.

    Each token is passed to ``PorterStemmer.stem`` together with the indices
    0 and ``len(token) - 1`` (presumably the start and end positions to stem;
    confirm against Porter_Stemmer_Python).

    Returns a new list of stemmed tokens in the original order.
    """
    stemmer = ps.PorterStemmer()
    return [stemmer.stem(word, 0, len(word) - 1) for word in sentence_tokens]

def stem_text(tokenized_sentences):
    """Run stem_words over every tokenized sentence and collect the results."""
    return [stem_words(token_list) for token_list in tokenized_sentences]

def flatten_text(sentences):
    """Concatenate the per-sentence token lists into one flat token list."""
    flat_tokens = []
    for token_list in sentences:
        for item in token_list:
            flat_tokens.append(item)
    return flat_tokens

def generate_token_set(tokens):
    """Return the set of distinct tokens found in ``tokens``."""
    unique = set()
    unique.update(tokens)
    return unique

def generate_frequency_count_for_sentence(sentence, token_set):
    """Build a token -> 0/1 presence map for one sentence.

    Despite the name, the value is an indicator rather than a count: a token
    that occurs several times in ``sentence`` still maps to 1.

    Args:
        sentence: Container of tokens for one sentence (tested with ``in``).
        token_set: Iterable of all tokens that should appear as keys.

    Returns:
        A dict with one key per token in ``token_set``; the value is 1 if the
        token occurs in ``sentence`` and 0 otherwise.
    """
    return {term: (1 if term in sentence else 0) for term in token_set}

def generate_text_frequency_vector(text, token_set):
    """Build one token-presence dict per sentence in ``text``.

    Each dict comes from generate_frequency_count_for_sentence and is keyed by
    the tokens in ``token_set``.
    """
    return [
        generate_frequency_count_for_sentence(token_list, token_set)
        for token_list in text
    ]


# Load the stop-word list from stop_words.txt (one entry per non-blank line)
stop_words = retrieve_text(fileName="stop_words.txt")
# Load the raw sentences from sentences.txt (one sentence per non-blank line)
sentences = retrieve_text(fileName="sentences.txt")

# Convert all sentences to lowercase
sentences = convert_to_lower(sentences)
# Remove every digit character from the sentences
sentences = strip_numbers(sentences)
# Tokenize: split each sentence on whitespace into a list of word tokens
sentences = split_text(sentences)

# Strip punctuation (non-word characters and underscores) from every token,
# dropping tokens that end up empty
sentence_tokens = strip_text(sentences)

# Remove stopwords. This runs after punctuation stripping, so tokens are
# compared against stop_words in their punctuation-free form.
sentence_tokens = remove_stopwords(sentence_tokens, stop_words)

# Perform stemming on every remaining token
stemmed_sentences = stem_text(sentence_tokens)

# Flatten the per-sentence token lists into a single list
stemmed_tokens = flatten_text(stemmed_sentences)

# Generate the unique token set (the vocabulary)
unique_tokens = generate_token_set(stemmed_tokens)

# Generate one vector per sentence: a dict keyed by every vocabulary token,
# with value 1 if the token occurs in that sentence and 0 otherwise
vector_list = generate_text_frequency_vector(stemmed_sentences, unique_tokens)

In [59]:
# Demonstrate that the tokens are unique (unique_tokens is a set)
print(unique_tokens)


{'great', 'suit', 'home', 'know', 'famili', 'attain', 'obsolesc', 'necessarili', 'lai', 'interior', 'biolog', 'remodel', 'spiritu', 'minut', 'predict', 'escap', 'area', 'well', 'cosmo', 'term', 'trash', 'king', 'road', 'number', 'averag', 'learn', 'entir', 'near', 'us', 'second', 'rai', 'open', 'paradigm', 'nice', 'inventor', 'driven', 'sentienc', 'pet', 'gallon', 'sedan', 'charg', 'feel', 'full', 'secur', 'space', 'sens', 'freshli', 'rout', 'domain', 'room', 'coin', 'selfawar', 'reason', 'paint', 'percent', 'achiev', 'go', 'finish', 'autonom', 'car', 'deal', 'deep', 'kitchen', 'floor', 'includ', 'central', 'new', 'knowledg', 'speed', 'human', 'cute', 'mile', 'emot', 'understand', 'classi', 'world', 'gener', 'singl', 'fundament', 'lisp', 'ga', 'awai', 'everyth', 'languag', 't', 'throughout', 'combin', 'sort', 'five', 'updat', 'over', 'queen', 'have', 'negoti', 'on', 'two', 'water', 'comput', 'recent', 'describ', 'air', 'up', 'machin', 'system', 'work', 'rent', 'far', 'intellig', 'base'

In [54]:
# Now let's check out our very sparse matrix...
# df has one row per sentence vector and one column per token.
df = pd.DataFrame(vector_list)
# df_transpose has one row per token and one column per sentence.
df_transpose = df.transpose()
# df.to_csv("tdm.csv", sep=",")
# df_transpose.to_csv("transpose_tdm.csv", sep=",")
# NOTE: a notebook cell only renders its last expression, so df.head() below
# produces no visible output; only df_transpose.head() is displayed.
df.head()
df_transpose.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
accidentfre,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
achiev,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ag,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
air,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
anim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
import math 

def euclidean_distance(vector1, vector2):
    """Return the Euclidean distance between two dict-based vectors.

    The keys of ``vector1`` drive the computation, so ``vector2`` must contain
    every key of ``vector1`` (a missing key raises KeyError). Keys that exist
    only in ``vector2`` are ignored.
    """
    total = 0
    for key, left_value in vector1.items():
        total += (left_value - vector2[key])**2
    return math.sqrt(total)

def get_weighted_average(vector_list):
    """Return the per-key arithmetic mean of a list of dict-based vectors.

    The keys are taken from the first vector; every other vector must contain
    them as well.

    Returns:
        A dict mapping each key to its mean over all vectors, or the integer
        0 when ``vector_list`` is empty.
    """
    count = len(vector_list)
    if count == 0:
        return 0
    averages = {}
    for key in vector_list[0]:
        running_total = 0
        for vector in vector_list:
            running_total += vector[key]
        averages[key] = running_total / count
    return averages

def generate_FCAN(current_avg_vector, new_vector, n, alpha=1):
    """Return the cluster average updated with one additional vector.

    For every key of ``new_vector`` the matching component of
    ``current_avg_vector`` is combined with the new component through
    generate_w_k.

    Args:
        current_avg_vector: Current average vector (dict) of the cluster.
        new_vector: Vector (dict) being added to the cluster.
        n: Value forwarded to generate_w_k as its ``m`` argument.
        alpha: Weight forwarded to generate_w_k for the new component.

    Returns:
        A new dict holding the updated average; the inputs are not modified.
    """
    return {
        key: generate_w_k(current_avg_vector[key], component, n, alpha)
        for key, component in new_vector.items()
    }
    
# 
def generate_w_k(weighted_vector, new_vector, m, alpha=1):
    """Fold one new component into a running average component.

    Computes ``(m * weighted_vector + alpha * new_vector) / (m + 1)``, where
    both "vector" arguments are single numeric components.
    """
    numerator = (m*weighted_vector) + (alpha*new_vector)
    denominator = m + 1
    return numerator/denominator


def pull_valid_keys_from_hash(some_hash):
    """Return the keys of ``some_hash`` whose value is not 0, in dict order."""
    return [key for key, value in some_hash.items() if value != 0]



In [56]:
# Check to see that the get_weighted_average function works correctly
# test_df = df.iloc[1:4]
# print(test_df.mean(axis=0))

# Test against dataframe subset to ensure working correctly
#print(get_weighted_average(vector_list[1:4]))

# Check Euclidean Distance is calculated correctly
# print(euclidean_distance(vector_list[0], vector_list[1]))


In [57]:
def generate_FCAN_clusters(vector_list, min_distance=10, best_fit=False):
    """Group vectors into clusters, creating a new cluster whenever needed.

    The first vector seeds the first cluster. Every later vector is compared
    with each cluster's running average; it joins a cluster whose average is
    strictly closer than ``min_distance``, otherwise it starts a new cluster.

    Args:
        vector_list: List of dict-based vectors sharing the same keys.
        min_distance: Euclidean distance threshold below which a vector may
            join an existing cluster.
        best_fit: When False (default, the historical behaviour) a vector
            joins the LAST cluster that is within the threshold. When True it
            joins the NEAREST qualifying cluster (the first one on ties).

    Returns:
        A list of clusters, each a list of the member vectors in insertion
        order. An empty ``vector_list`` yields an empty list.
    """
    # Guard: indexing vector_list[0] on an empty input used to raise IndexError.
    if len(vector_list) == 0:
        return []

    current_clusters = [[vector_list[0]]]
    cluster_avgs = [vector_list[0]]

    for vec in vector_list[1:]:
        best_cluster_index = None
        best_distance = None
        for j, cluster_vec in enumerate(cluster_avgs):
            current_distance = euclidean_distance(vec, cluster_vec)
            if current_distance >= min_distance:
                # Too far away from this cluster's average.
                continue
            if (best_fit and best_distance is not None
                    and current_distance >= best_distance):
                # Best-fit mode: keep the closer cluster found earlier.
                continue
            best_distance = current_distance
            best_cluster_index = j

        if best_cluster_index is not None:
            # Cluster size BEFORE adding the new vector; used as the weight
            # of the existing average in the update.
            m = len(current_clusters[best_cluster_index])
            current_clusters[best_cluster_index].append(vec)
            cluster_avgs[best_cluster_index] = generate_FCAN(
                cluster_avgs[best_cluster_index], vec, m)
        else:
            # No cluster close enough --> create a new cluster
            current_clusters.append([vec])
            cluster_avgs.append(vec)
    return current_clusters


def display_FCAN_clusters(clusters):
    """Print every cluster with a header and one numbered line per member.

    Each member line lists the keys of the member vector whose value is not
    zero, separated by spaces.
    """
    for cluster_number, members in enumerate(clusters, start=1):
        header = "\n------Printing out contents of cluster {}  : Cluster length : {}--------".format(cluster_number, len(members))
        print(header)
        for member_number, vector in enumerate(members, start=1):
            present_keys = pull_valid_keys_from_hash(vector)
            print("{} : {}".format(member_number, " ".join(present_keys)))
    

In [58]:
# Cluster the sentence vectors; 4 is the min_distance threshold, i.e. a
# sentence joins a cluster only if its Euclidean distance to that cluster's
# average is below 4.
clusters = generate_FCAN_clusters(vector_list,4)
# Print each cluster and the tokens present in each of its sentences.
display_FCAN_clusters(clusters)


------Printing out contents of cluster 1  : Cluster length : 7--------
1 : road sedan speed mile up hour per type autonom travel
2 : second car mile hour per get kilomet
3 : road charg achiev mile rang around kilomet test
4 : obsolesc biolog escap human sort have machin intellig merger wai musk
5 : charg percent go car mile befor up kilomet
6 : sedan mile hour around per lap kilomet autonom
7 : necessarili learn sentienc machin intellig artifici lead possibl

------Printing out contents of cluster 2  : Cluster length : 1--------
1 : learn year predict human awai machin far intellig artifici kurzweil futur rai superintellig singular selfimprov exce

------Printing out contents of cluster 3  : Cluster length : 15--------
1 : larg home remodel eat full room util kitchen rent live size bedroom newli bath
2 : larg home interior entir room paint live bedroom freshli
3 : home finish hous dryer bedroom four washer basement come row bath
4 : pet space back owner three approv park possibl
5 : r