### Simple implementation of text as vectors for classification

In [21]:
import numpy as np
import pandas as pd
import math
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction import FeatureHasher
import collections
import string
import re
import matplotlib.pyplot as plt
%matplotlib inline
work_dir = %pwd # Get the working directory.
training_set = pd.read_csv(str(work_dir + "/" + "train.csv")) # Read in the training data.
test_set = pd.read_csv(str(work_dir + "/" + "test.csv")) # Read in the test data.
#training_set = training_set[0:100]
training_set.columns = ['id', 'text', 'written_by']
print('Data loaded.')

Data loaded.


In [23]:
def clean_text(text, keep_words=None):
    '''Turns a table of documents into a per-document word-count table.
    In: text -- object whose 'text' entry iterates over document strings
            (e.g. a DataFrame with a 'text' column);
        keep_words -- optional vocabulary; when it is non-empty, every word
            outside it is discarded (None or empty keeps all words)
    Out: (count_frame, unique_words) -- count_frame has one row per document
         and one column per word (columns in sorted order), each cell holding
         how many times the word occurs in the document; unique_words is the
         set of words that label the columns
    '''
    # None / empty means "no vocabulary filter"; a set gives fast membership tests.
    if keep_words is not None and len(keep_words) > 0:
        vocabulary = set(keep_words)
    else:
        vocabulary = None

    # Build a list of lists with the lower-cased, punctuation-stripped words.
    text_lol = []
    for document in text['text']:
        kept = []
        for raw_word in document.lower().split():
            # Strip punctuation BEFORE the vocabulary check, so that a token
            # such as "cat," is compared as "cat".
            word = raw_word.strip(string.punctuation)
            if not word:
                # The token was punctuation only; it carries no word.
                continue
            if vocabulary is not None and word not in vocabulary:
                continue
            kept.append(word)
        text_lol.append(kept)

    unique_words = {word for words in text_lol for word in words}

    print('Text cleaned and preprocessed.')

    # Fix a deterministic column order and remember each word's column position.
    columns = sorted(unique_words)
    column_of = {word: position for position, word in enumerate(columns)}

    # Fill a plain array first: writing through chained DataFrame indexing
    # (frame.iloc[i][j] += 1) may update a temporary copy instead of the frame.
    counts = np.zeros((len(text_lol), len(columns)))
    for row, words in enumerate(text_lol):
        for word, occurrences in collections.Counter(words).items():
            counts[row, column_of[word]] = occurrences
    count_frame = pd.DataFrame(counts, columns=columns)
    print('Counts dataframe created')

    return count_frame, unique_words

# Vectorise the training text first; its vocabulary is then used to filter the test text.
count_frame, vocab = clean_text(training_set)
test_frame, test_vocab = clean_text(text=test_set, keep_words=vocab)

# Inner-join on the column axis so both frames keep only the word columns they share.
test_frame, train_frame = test_frame.align(count_frame, join='inner', axis=1)


Text cleaned and preprocessed.
Counts dataframe created
Text cleaned and preprocessed.
Counts dataframe created


In [24]:
# Label every training row with its author, then add the rows up per author
# so that each author is represented by a single summed vector.
train_frame.index = training_set['written_by']
# Call the groupby reduction directly: passing Python's builtin `sum` to
# aggregate() is only mapped onto this same reduction by pandas, and newer
# pandas versions warn that this mapping will go away.
by_author_train = train_frame.groupby('written_by').sum()
print('Reindexed, count frame created')


Reindexed, count frame created


In [26]:
def vector_angle_calculator(v1, v2, radians = True):
    '''Calculates angle between two vectors.
    In: 2 numeric vectors of equal length (v1, v2) -- lists, arrays or
        pandas Series; radians -- truthy for radians, falsy for degrees
    Out: radian or degree value as a float; when either vector has zero
         magnitude the angle is undefined and a right angle is returned
    Raises: ValueError when the vectors do not have the same length
    '''
    # Convert to flat float arrays so elements are paired by position;
    # integer keys on a label-indexed pandas Series are not positional.
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            'vectors must have the same length, got %d and %d' % (a.size, b.size)
        )

    mag_product = math.sqrt(float(np.dot(a, a))) * math.sqrt(float(np.dot(b, b)))
    if mag_product == 0.0:
        # Avoid dividing by zero: treat a zero vector as orthogonal to everything.
        theta_val = math.pi / 2
    else:
        cos_theta = float(np.dot(a, b)) / mag_product
        # Rounding can push the ratio just outside acos's [-1, 1] domain.
        cos_theta = max(-1.0, min(1.0, cos_theta))
        theta_val = math.acos(cos_theta)

    if radians:
        return theta_val
    return math.degrees(theta_val)

In [None]:
def find_best_match(count_frame, target):
    '''Finds the row of count_frame whose vector makes the smallest angle with target.
    In: count_frame -- DataFrame holding one candidate vector per row;
        target -- vector compared against every row
    Out: index label of the row with the smallest angle; when several rows
         give exactly the same smallest angle, the last of them is returned
    '''
    print('Finding best match...')
    angles = []
    label_for_angle = {}
    for position, (row_label, row_vector) in enumerate(count_frame.iterrows()):
        print('At count_frame case:', position)
        angle = vector_angle_calculator(row_vector, target)
        angles.append(angle)
        # Keyed by angle, so a later row with an identical angle replaces an earlier one.
        label_for_angle[angle] = row_label
    print('Best match found.')
    smallest_angle = min(angles)
    return label_for_angle[smallest_angle]

In [10]:
def predict_author(train_values, test_values):
    '''Predicts a label for every row of test_values.
    In: train_values -- DataFrame of reference vectors, one per row; the row
            index labels are what gets predicted;
        test_values -- DataFrame of vectors to classify, one per row
    Out: list holding, for each test row in order, the index label of the
         best-matching row of train_values
    '''
    predictions = []
    # iterrows() yields (index label, row) pairs; only the row itself is the
    # vector to match, so unpack the pair rather than passing it on whole.
    for count, (_, test_row) in enumerate(test_values.iterrows()):
        print('\n At case:', count, '\n')
        match = find_best_match(train_values, test_row)
        predictions.append(match)
        print('Match is:', match)
    return predictions

In [None]:
# Label every row of the full training count frame with its author, then add
# the rows up per author so that each author has a single summed vector.
count_frame.index = training_set['written_by']
# Call the groupby reduction directly: passing Python's builtin `sum` to
# aggregate() is only mapped onto this same reduction by pandas, and newer
# pandas versions warn that this mapping will go away.
by_author = count_frame.groupby('written_by').sum()
print('Reindexed, count frame created')


In [None]:
def vector_angle_calculator(v1, v2, radians = True):
    '''Calculates angle between two vectors.
    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell runs last is the one in effect.
    In: 2 numeric vectors of equal length (v1, v2) -- lists, arrays or
        pandas Series; radians -- truthy for radians, falsy for degrees
    Out: radian or degree value as a float; when either vector has zero
         magnitude the angle is undefined and a right angle is returned
    Raises: ValueError when the vectors do not have the same length
    '''
    # Convert to flat float arrays so elements are paired by position;
    # integer keys on a label-indexed pandas Series are not positional.
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    if a.shape != b.shape:
        raise ValueError(
            'vectors must have the same length, got %d and %d' % (a.size, b.size)
        )

    mag_product = math.sqrt(float(np.dot(a, a))) * math.sqrt(float(np.dot(b, b)))
    if mag_product == 0.0:
        # Avoid dividing by zero: treat a zero vector as orthogonal to everything.
        theta_val = math.pi / 2
    else:
        cos_theta = float(np.dot(a, b)) / mag_product
        # Rounding can push the ratio just outside acos's [-1, 1] domain.
        cos_theta = max(-1.0, min(1.0, cos_theta))
        theta_val = math.acos(cos_theta)

    if radians:
        return theta_val
    return math.degrees(theta_val)

In [None]:
def find_best_match(count_frame, target):
    '''Finds the row of count_frame whose vector makes the smallest angle with target.
    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; whichever cell runs last is the one in effect.
    In: count_frame -- DataFrame holding one candidate vector per row;
        target -- vector compared against every row
    Out: index label of the row with the smallest angle; when several rows
         give exactly the same smallest angle, the last of them is returned
    '''
    print('Finding best match...')
    angles = []
    label_for_angle = {}
    for position, (row_label, row_vector) in enumerate(count_frame.iterrows()):
        print('At count_frame case:', position)
        angle = vector_angle_calculator(row_vector, target)
        angles.append(angle)
        # Keyed by angle, so a later row with an identical angle replaces an earlier one.
        label_for_angle[angle] = row_label
    print('Best match found.')
    smallest_angle = min(angles)
    return label_for_angle[smallest_angle]

In [None]:
def predict_author(test_values):
    '''Predicts a label for every row of test_values against the global count_frame.
    NOTE(review): this one-argument definition has the same name as the
    two-argument predict_author in an earlier cell; whichever cell runs last
    is the one in effect.
    In: test_values -- DataFrame of vectors to classify, one per row;
        presumably it must have the same columns as count_frame -- TODO confirm
    Out: list holding, for each test row in order, the index label of the
         best-matching row of count_frame
    '''
    predictions = []
    # Classify the rows that were passed in; the CSV does not need re-reading
    # and the training rows must not be matched against themselves.
    for count, (row_label, test_row) in enumerate(test_values.iterrows()):
        print('\n At case:', count, '\n')
        match = find_best_match(count_frame, test_row)
        predictions.append(match)
        print('Match is:', match)
        # Shows the test row's index label, as the original print did.
        print('Real value is:', row_label)
    return predictions


In [None]:
# Read in the raw test data from test.csv in the working directory.
# NOTE(review): this rebinds test_frame, which an earlier cell assigned from clean_text()/align().
test_frame = pd.read_csv("/".join([work_dir, "test.csv"]))

In [None]:
# Preview the first rows only instead of rendering the whole frame.
test_frame.head()

In [None]:
hasher = FeatureHasher(input_type='string')
# With input_type='string' each sample must be an iterable of string tokens,
# so split every document into lower-cased words before hashing.
hashed_text = hasher.transform(
    document.lower().split() for document in training_set['text']
)
# The hashed matrix is sparse and, with the default n_features, very wide;
# densify only the first few rows for inspection rather than the whole matrix.
hashed_text_array = hashed_text[:5].toarray()
hashed_text_array

In [None]:
# Preview the first rows only instead of rendering the whole frame.
training_set.head()