# Identifying author
The main purpose of this notebook is to identify the author of each sentence in the test set by using the sentences from the master file that was created earlier.

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Set to True to make the later reporting cell print extra diagnostic output.
verbose = False

In [3]:
# NLTK pieces used by the stop-word removal cells below.
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize

# Stored as a set so that the per-token membership checks are cheap.
stop_words = set(stopwords.words('english'))

## dfToCsv
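This function takes a data frame and the name of a file as a string. It writes the data frame to that file inside the `data/stopRemoved` folder: if the file already exists the rows are appended without a header, otherwise a new file is created with a header row.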

In [4]:
def dfToCsv(df, filename, isIndexed = False):
    """Append a data frame to a CSV file under ``<cwd>/data/stopRemoved``.

    The column header is written only when the target file does not exist
    yet; if the file is already there, the rows are appended below its
    current content without repeating the header. Calling the function
    twice with the same frame (for example by re-running a cell) therefore
    stores the rows twice.

    Parameters
    ----------
    df : pandas.DataFrame
        The data frame to write.
    filename : str
        Name of the CSV file inside the ``data/stopRemoved`` folder.
    isIndexed : bool, optional
        Whether the data frame's index is written as well. Defaults to False.

    Raises
    ------
    OSError
        If the output folder cannot be created or the file cannot be written.
    """
    targetDir = os.path.join(os.getcwd(), "data", "stopRemoved")
    # Create the output folder on first use so a fresh checkout does not fail.
    os.makedirs(targetDir, exist_ok = True)
    targetFile = os.path.join(targetDir, filename)

    # Hand the path to pandas instead of a manually opened handle: pandas then
    # opens the file the way its CSV writer expects and always closes it, even
    # when writing fails. Any I/O error surfaces here directly.
    df.to_csv(
        targetFile,
        mode = "a",
        header = not os.path.exists(targetFile),  # header only for a new file
        index = isIndexed,
    )
    # end dfToCsv

## openFile
This function tries to open a file. It takes two parameters: the file name and the mode to open it in, which defaults to read mode (`"r"`). It returns the opened file; if the file cannot be opened, it prints a message and returns `None`.

In [5]:
def openFile (filename, mode = "r"):
    """Open ``filename`` in the given ``mode`` and return the file object.

    Opening is best-effort: if it fails, a message describing the problem is
    printed and ``None`` is returned instead of raising, so callers must
    check the result before using it.

    Parameters
    ----------
    filename : str
        Path of the file to open.
    mode : str, optional
        Mode string passed to the built-in ``open``. Defaults to ``"r"``.

    Returns
    -------
    file object or None
        The opened file, or ``None`` when it could not be opened.
    """
    try:
        return open(filename, mode)

    except IOError as err:
        print("An unexpected IO error occurred in openFile while opening {!r} (mode {!r}): {}".format(filename, mode, err))
    except Exception as err:
        # Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed along with genuine open() failures.
        print("An unexpected error occurred in openFile while opening {!r} (mode {!r}): {}".format(filename, mode, err))

    # Only reached after a failure; spell out the result callers receive.
    return None
    # end openFile

Load the train, test, and master files. Note that the cell below only loads the data; it does not itself create an empty author column in test.

In [6]:
# Folder holding the input CSV files; the trailing "" component makes the
# path end with a separator.
currentPath = os.path.join(os.getcwd(), "data", "")

# Read the three input tables from that folder.
train = pd.read_csv(os.path.join(currentPath, "train.csv"))
test = pd.read_csv(os.path.join(currentPath, "test.csv"))
# master.csv is read with its first column as the row index.
master = pd.read_csv(os.path.join(currentPath, "master.csv"), index_col = 0)



In [7]:
# Remove stop words from every sentence of the test set, then save the result.

# The tokenizer does not depend on the row, so it is built once up front.
# r'\w+' keeps runs of word characters, which also discards punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# Transform the whole column in one pass. This avoids pairing iterrows()
# index *labels* with iloc *positions*, which only agree for a default
# 0..n-1 index.
# NOTE(review): the membership test is case-sensitive, so a capitalised token
# is removed only if stop_words contains that exact capitalised form — confirm
# this is intended.
test["text"] = test["text"].map(
    lambda sentence: ' '.join(
        word for word in tokenizer.tokenize(sentence) if word not in stop_words
    )
)

dfToCsv(test, "stoppedTest.csv")

In [8]:
# Remove stop words from every sentence of the training set, then save the result.

# The tokenizer does not depend on the row, so it is built once up front.
# r'\w+' keeps runs of word characters, which also discards punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# Transform the whole column in one pass. This avoids pairing iterrows()
# index *labels* with iloc *positions*, which only agree for a default
# 0..n-1 index.
# NOTE(review): the membership test is case-sensitive, so a capitalised token
# is removed only if stop_words contains that exact capitalised form — confirm
# this is intended.
train["text"] = train["text"].map(
    lambda sentence: ' '.join(
        word for word in tokenizer.tokenize(sentence) if word not in stop_words
    )
)

dfToCsv(train, "stoppedTrain.csv")

In [9]:
# Remove stop words from every sentence of the master set, then save the result.

# The tokenizer does not depend on the row, so it is built once up front.
# r'\w+' keeps runs of word characters, which also discards punctuation.
tokenizer = RegexpTokenizer(r'\w+')

# master is loaded with its first CSV column as the index, so its row labels
# are not guaranteed to be the positions 0..n-1 (or even unique). Mapping over
# the column is correct for any index, whereas combining iterrows() labels
# with iloc positions is not.
# NOTE(review): the membership test is case-sensitive, so a capitalised token
# is removed only if stop_words contains that exact capitalised form — confirm
# this is intended.
master["text"] = master["text"].map(
    lambda sentence: ' '.join(
        word for word in tokenizer.tokenize(sentence) if word not in stop_words
    )
)

dfToCsv(master, "stoppedMaster.csv")

In [10]:
print("\ncounting how many rows are labeled and empty\n")

# Non-missing values per column of the test set.
print(test.count())

# Work on a copy so nothing below can alter `test` itself.
labeledTest = test.copy()
print(labeledTest)

# True for rows in which no column holds the text "None".
# The mask is only used for reporting: labeledTest itself stays unfiltered,
# because it is saved afterwards as the full set.
# NOTE(review): if the rows containing "None" were meant to be removed from
# labeledTest, assign labeledTest[isLabeled] back instead — confirm intent.
isLabeled = labeledTest.astype(str).ne("None").all(axis = 1)
print("rows without 'None':", int(isLabeled.sum()), "of", len(labeledTest))

if(verbose):
    print("dropped n/a")
    # Show the frame as it looks with the "None" rows left out.
    print(labeledTest[isLabeled])



counting how many rows are labeled and empty

id      8392
text    8392
dtype: int64
           id                                               text
0     id02310  Still I urged leaving Ireland inquietude impat...
1     id24541  If fire wanted fanning could readily fanned ne...
2     id00134  And broken frail door found two cleanly picked...
3     id27757  While I thinking I possibly manage without one...
4     id04081                  I sure limit knowledge may extend
5     id27337  The thick peculiar mist smoke distinguishes In...
6     id24265                That matter unless qualities things
7     id25917  I sought repose although I hope forgetfulness ...
8     id04951  Upon fourth day assassination party police cam...
9     id14549                The tone metaphysical also good one
10    id22505  These offspring later period stood erect seeme...
11    id24002  What kept going Brown Jenkin throne Chaos thin...
12    id18982  Persuading widow connexion husband technical m...
13  

In [11]:
# Save labeledTest (the copy of the test set) under the name FullStopLabeled.csv.
dfToCsv(df = labeledTest, filename = "FullStopLabeled.csv")

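Removing rows that don't have something in the author column. Note that `dropna()` as used below removes every row that has a missing value in any column, not only in the author column.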

In [12]:
# Keep only the rows that have no missing value in any column.
# dropna() returns a new frame, so labeledTest itself is left unchanged.
droppedLabel = labeledTest.dropna()


In [13]:
# Save the rows that survived dropna() under the name DroppedStopLabeled.csv.
dfToCsv(df = droppedLabel, filename = "DroppedStopLabeled.csv")

To-do:
* ~~load test~~
* ~~load master~~
* ~~create column~~
* ~~iterate through test~~
    * search for exact match of text
    * add author to entry
* Check how many are unidentified
* Write identified entries to .csv file
