# Machine Learning Part 1: Data Importation & Processing

Worked on by Vionna Atefi

In this part of the assignment, I work on importing the testing data, 
extracting the correct features, and turning each author's text into a word-count vector 
that can later be used by a machine learning classifier. 

In [2]:
def add_to_corpus(corpus, list_of_author_files):
    """Return the sorted vocabulary of ``corpus`` plus the words in the given files.

    Each file is read in full, lower-cased, stripped of the characters
    ``" ? ! . , ; ( ) -`` and split on whitespace; every resulting token is
    added to the vocabulary.

    Args:
        corpus: Iterable of words already in the corpus, or ``None`` to start
            from an empty corpus. It is copied, never mutated.
        list_of_author_files: Iterable of paths to the text files to read.

    Returns:
        A sorted list of the unique words from ``corpus`` and all files.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """
    # Seed from the caller's corpus instead of discarding it; a set avoids
    # duplicates, and copying keeps the caller's collection untouched.
    vocabulary = set(corpus) if corpus is not None else set()

    # One translate() pass deletes quotation marks and punctuation; the
    # characters are removed (not replaced by a space), so "well-known"
    # becomes "wellknown".
    strip_table = str.maketrans('', '', '"?!.,;()-')

    for author_filename in list_of_author_files:
        with open(author_filename, 'r') as file:
            all_txt = file.read().lower().translate(strip_table)
        # split() with no argument separates on any run of whitespace
        vocabulary.update(all_txt.split())

    return sorted(vocabulary)  # returns a list

In [35]:
# Number of author files in each gender-identity group.
# The code that fills the per-group dictionaries slices list_of_author_files
# positionally using these counts, so the files must be listed group by group
# in this order: cis women, cis men, trans women, trans men, non-binary.
NUM_CISWOMEN = 3
NUM_CISMEN = 3
NUM_TRANSWOMEN = 3
NUM_TRANSMEN = 1
NUM_NONBINARY = 0  # no non-binary author files yet

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

array_of_text = []

# NOTE: order matters. Files must be listed group by group in the order
# cis women, cis men, trans women, trans men, non-binary, because the NUM_*
# constants are used below to slice this list positionally.
list_of_author_files = [
    "Austen.txt", "CharlotteBronte.txt", "Shelley.txt",         # cis women
    "Dickens.txt", "Hemingway.txt", "Kerouac.txt",              # cis men
    "CaseyPlett.txt", "CharlieAnders.txt", "ImogenBinnie.txt",  # trans women
    "ElliotDeline.txt",                                         # trans men
]

# Read every author's text in full; array_of_text[i] is the content of
# list_of_author_files[i].
for author_filename in list_of_author_files:
    with open(author_filename, 'r') as file:
        all_txt = file.read()
        array_of_text.append(all_txt)

# Build one word-count row per document; row i of count_matrix belongs to
# list_of_author_files[i].
count_vect = CountVectorizer()
count_matrix = count_vect.fit_transform(array_of_text)
count_matrix = count_matrix.toarray()

# Gender-identity-specific dictionaries:
#   key   = file name, e.g. "Austen.txt" (str)
#   value = word counts, e.g. [0, 0, 0, ..., 2, 3, 0] (list of integers)
cisWomen = {}
cisMen = {}
transWomen = {}
transMen = {}  # NOTE: I don't know if there's a comparable number of this within fiction yet
nonBinary = {}  # NOTE: Also don't know if there are enough nonBinary authors to train a classifier with this yet

# Walk the groups in file order, giving each group the next `group_size`
# rows of count_matrix. A group with size 0 (currently non-binary) simply
# receives nothing, and starts being filled as soon as its count is raised.
group_start = 0
for group_dict, group_size in (
    (cisWomen, NUM_CISWOMEN),
    (cisMen, NUM_CISMEN),
    (transWomen, NUM_TRANSWOMEN),
    (transMen, NUM_TRANSMEN),
    (nonBinary, NUM_NONBINARY),
):
    group_end = group_start + group_size
    for i in range(group_start, group_end):
        # tolist() turns the np.array row into a plain list of ints
        group_dict[list_of_author_files[i]] = count_matrix[i].tolist()
    group_start = group_end

print(cisWomen)
print(cisMen)
print(transWomen)
print(transMen)

{'Austen.txt': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0