### Review of String methods

These string methods are useful, but they are not enough to solve our `Document Distance Problem`.

In [85]:
# 1. start from a sample line of text (hard-coded here, standing in for a line from a file)
text = '  ；The Zen of Python, by Tim Peters  '

# 2. strip() removes the whitespace at the front and end of the line
output1 = text.strip()

# 3. lower() converts all the characters in the string to lower case
output2 = output1.lower()

# 4. split() breaks the long string into individual words on whitespace
output3 = output2.split()

# Show every stage in order: raw text, stripped, lower-cased, list of words
for stage in (text, output1, output2, output3):
    print(stage)

  ；The Zen of Python, by Tim Peters  
；The Zen of Python, by Tim Peters
；the zen of python, by tim peters
['；the', 'zen', 'of', 'python,', 'by', 'tim', 'peters']


### Step 1: Read document from file

In [86]:
import sys

def read_file(filename):
    """
    Read the text file with the given filename;
    return a list of the lines of text in the file.

    Each returned line keeps its trailing newline, exactly as produced by
    ``readlines()``.

    If the file cannot be opened or read (IOError), an error message is
    printed and ``sys.exit()`` is called, which raises ``SystemExit``.
    """
    try:
        # The context manager closes the file even when readlines() raises,
        # which a separate open()/close() pair does not guarantee.
        with open(filename, 'r') as f:
            return f.readlines()
    except IOError:
        print("Error opening or reading input file: ",filename)
        sys.exit()

# Test the function.
# 'zen_of_python.txt' is a relative path, so it is resolved against the
# current working directory. The resulting `read_text` list is reused by
# the Step 3 test.
read_text = read_file('zen_of_python.txt')
print(read_text)

['a b c\n', '\n']


### Step 2: Split each string into words

In [87]:
def get_words_from_string(line):
    """
    Split a string into lower-case alphanumeric words.

    A word is a maximal run of characters for which ``str.isalnum()`` is
    true. Every other character (space, punctuation, newline, ...) acts as
    a separator and is dropped.

    Parameters:
        line -- the string to split.

    Returns:
        A list of the words in order of appearance, each character
        lower-cased. An empty list if the string contains no words.
    """
    word_list = []
    char_list = []

    # c is the character
    for c in line:
        if c.isalnum():
            char_list.append(c.lower())
        elif char_list:
            # A separator ends the word collected so far.
            word_list.append(''.join(char_list))
            char_list = []

    # Flush the last word when the string does not end with a separator.
    # This is a plain statement rather than a for/else clause: the loop has
    # no `break`, so an else branch would run unconditionally anyway.
    if char_list:
        word_list.append(''.join(char_list))

    return word_list

# Test the function: the spaces, the apostrophe and the newline all act as
# separators, so "v'jakl" yields the two words 'v' and 'jakl'.
test_str = "   sdf1aj dkfv v'jakl dkf al \n"
test_out = get_words_from_string(test_str)
print(test_out)

['sdf1aj', 'dkfv', 'v', 'jakl', 'dkf', 'al']


### Step 3: Collect all the words in each document into a list

Each line of text is treated as a `document`.

In [88]:
def get_words_from_line_list(line_list):
    """
    Turn a list of text lines into one flat list of words.

    Every line is split with get_words_from_string() and the resulting
    words are concatenated in line order.
    """
    return [
        word
        for line in line_list
        for word in get_words_from_string(line)
    ]

# Test the function on the lines loaded by the Step 1 test (`read_text`).
# The resulting `words` list is reused by the Step 4 test.
words = get_words_from_line_list(read_text)
print(words)

['a', 'b', 'c']


### Step 4: Count the frequency of each distinct word in the list

In [89]:
def count_frequency(word_list):
    """
    Count how many times each distinct word occurs in word_list.

    Returns a dict mapping word -> number of occurrences; keys appear in
    the order in which each word is first seen.
    """
    counts = {}
    for token in word_list:
        # get() supplies 0 for a word that has not been seen yet
        counts[token] = counts.get(token, 0) + 1
    return counts

# Test the function on the `words` list produced by the Step 3 test.
word_freq = count_frequency(words)
print(word_freq)

{'a': 1, 'b': 1, 'c': 1}


### Step 5: Use all the functions above in one place

In [90]:
def word_frequencies_for_file(file_path):
    """
    Run Steps 1-4 on one file and return its word-frequency dict.

    Also prints a short summary: the file path, the number of lines,
    the number of words and the number of distinct words.
    """
    lines = read_file(file_path)               # Step 1
    words = get_words_from_line_list(lines)    # Step 3 (uses Step 2 internally)
    frequencies = count_frequency(words)       # Step 4

    print("File <", file_path, ">:")
    summary = (
        (len(lines), "lines,"),
        (len(words), "words,"),
        (len(frequencies), "distinct words.\n"),
    )
    for amount, label in summary:
        print(amount, label)
    return frequencies

# Run the whole pipeline on the sample file: the function prints its summary,
# then the returned frequency dict is printed.
res = word_frequencies_for_file('zen_of_python.txt')
print(res)

File < zen_of_python.txt >:
2 lines,
3 words,
3 distinct words.

{'a': 1, 'b': 1, 'c': 1}


### Step 6: Inner product of two vectors

In [91]:
# Version 1 beta
# the inputs are 2 docs
def inner_product_beta(v1, v2):
    """
    First (naive) inner product of two vectors stored as dicts.

    Multiplies v1[key] by v2[key] for every key of v1 and returns the sum
    as a float. Every key of v1 must also be present in v2; otherwise the
    lookup v2[key] raises KeyError.
    """
    total = 0.0

    for key, count in v1.items():
        total = total + count * v2[key]

    return total

# Both dicts list the same three keys (zero counts included), so every
# v2[key] lookup inside inner_product_beta succeeds.
v1 = {'the': 4, 'cat': 1, 'dog': 0}    # 4, 1, 0
v2 = {'the': 1, 'cat': 0, 'dog': 1}    # 1, 0, 1

# 4* 1 + 1 * 0 + 0* 1 = 4
ans = inner_product_beta(v1, v2)
print(ans)

4.0


In [92]:
# Version 2
# the inputs are 2 docs
def inner_product(v1, v2):
    """
    Inner (dot) product of two vectors stored as dicts.

    Only keys present in both dicts contribute; a key that appears in v1
    but not in v2 is skipped, which is the same as multiplying by 0.
    Returns the sum as a float.
    """
    total = 0.0

    for key, count in v1.items():
        if key not in v2:
            # missing in v2 -> contributes nothing
            continue
        total += count * v2[key]

    return total

# dot product
# assume it will be the output of our step 5
# The trailing comments list the components in the order (the, cat, dog);
# a word missing from a dict counts as 0.
v1 = {'the': 4, 'cat': 1}    # 4, 1, 0
v2 = {'the': 1, 'dog': 1}    # 1, 0, 1

# 4* 1 + 1 * 0 + 0* 1 = 4
ans = inner_product(v1, v2)
print(ans)

4.0


### Step 7: Length of the vector

In [93]:
def vector_length_square(vector):
    """
    Squared length of a vector stored as a dict.

    Returns the sum of the squares of the dict's values, as a float.
    """
    total = 0.0
    for count in vector.values():
        total += count ** 2
    return total

v1 = {'the': 3, 'cat': 4}    # 3**2 + 4**2 = 25
v2 = {'the': 6, 'dog': 8}    # 6**2 + 8**2 = 100

ans1 = vector_length_square(v1)
print(ans1)

ans2 = vector_length_square(v2)
print(ans2)

25.0
100.0


### Step 8: Vector angle

In [94]:
import math

def vector_angle(v1, v2):
    """
    Return the angle, in radians, between two vectors stored as dicts.

    The angle is acos(inner_product(v1, v2) / (|v1| * |v2|)).

    Raises:
        ZeroDivisionError -- if either vector has zero length (for example
        an empty dict), because the angle is then undefined.
    """
    # call Step 6
    numerator = inner_product(v1, v2)
    # call Step 7
    # sqrt(len_square_v1) * sqrt(len_square_v2)
    denominator = math.sqrt(vector_length_square(v1) * vector_length_square(v2))
    if denominator == 0:
        raise ZeroDivisionError(
            "cannot compute the angle: at least one vector has zero length")

    cosine = numerator / denominator
    # Floating-point rounding can push the ratio slightly outside [-1, 1]
    # (typically for identical or nearly parallel vectors), which would make
    # math.acos raise ValueError. Clamp it back into the valid domain.
    if cosine > 1.0:
        cosine = 1.0
    elif cosine < -1.0:
        cosine = -1.0

    radians_angle = math.acos(cosine)
    return radians_angle

# Two identical vectors point in the same direction, so the angle is 0.
v1 = {'the': 1, 'dog': 1}    # 1, 1
v2 = {'the': 1, 'dog': 1}    # 1, 1
ans = vector_angle(v1, v2)
print(ans)

0.0


### Step 9: Run the application

In [95]:
import math

def run_app(filename_1, filename_2):
    """
    Print the document distance between two text files.

    Builds the word-frequency dict of each file (first filename_1, then
    filename_2), computes the angle between them with vector_angle() and
    prints it in radians and in degrees.
    """
    frequencies = [word_frequencies_for_file(name) for name in (filename_1, filename_2)]
    angle = vector_angle(*frequencies)
    angle_in_degrees = math.degrees(angle)

    print("The distance between the documents is: %0.6f (radians)" % angle)
    print("The distance between the documents is: %0.6f (degrees)" % angle_in_degrees)

# Compare the two sample documents; both paths are relative to the current
# working directory.
run_app("notes_plan.txt", "zen_of_python.txt")

File < notes_plan.txt >:
1 lines,
1 words,
1 distinct words.

File < zen_of_python.txt >:
2 lines,
3 words,
3 distinct words.

The distance between the documents is: 1.570796 (radians)
The distance between the documents is: 90.000000 (degrees)


In [96]:
import math
# Sanity check: a vector with components 3 and 4 has length 5.
math.sqrt(3**2 + 4**2)

5.0

In [97]:
import math

def vector_length_square(vec):
    """
    Return the squared length of a vector: the sum of component ** 2.

    ``vec`` may be a sequence of numbers such as ``[3, 4]`` or a dict whose
    values are the components.

    This definition reuses the name of the dict-based function from Step 7,
    so it replaces that function once this cell has run. Accepting dicts as
    well keeps vector_angle(), which passes dicts, working afterwards.

    For a dict the sum starts from 0.0, so the result is a float as in
    Step 7; for a sequence it starts from 0, so integer components give an
    integer result.
    """
    if isinstance(vec, dict):
        components = vec.values()
        res = 0.0
    else:
        components = vec
        res = 0
    for v in components:
        res += v**2
    return res
    
    
v1 = [3, 4] # length 5, squared length 25
v2 = [6, 8] # length 10, squared length 100

# Call the function that is actually defined (vector_length_square) and
# print the variables that are actually assigned. The previous names
# (vector_length, len1, len2) are not defined anywhere in the notebook and
# raise NameError on a fresh kernel.
len1_square = vector_length_square(v1)
len2_square = vector_length_square(v2)

print(len1_square)
print(len2_square)

# sqrt(|v1|^2 * |v2|^2) == |v1| * |v2|
print(len1_square * len2_square)
print(math.sqrt(len1_square * len2_square))

25
100
2500
50.0


In [98]:
# This will give you a poem:
# an interesting poem that tells you the philosophy behind Python
import this