# Converting texts into CSV files.
This file takes the .txt files of the stories and splits them into separate sentences.  Labels in the header of each file are not handled by the program; the header must be removed manually beforehand.  After splitting, sentences at or below a minimum word count (3 words by default) are removed, on the assumption that they are labels such as 'Chapter II'.  All whitespace that is not a single space (line breaks, tabs, repeated spaces) is collapsed into a single space. 

In the menus, it assumes the user will follow the prompt as indicated.  There is no check to ensure the user is entering valid information at this time. 

The assumed folder structure for this program is that this file is in the current working directory.  In the same directory there is a folder named data, and inside data there are two folders, Books and author.  Inside Books are the folders MWS, EAP, and HPL, each of which contains .txt files of that author's work.  

In [1]:
import nltk.data
import pandas as pd
import os
import glob

# Load NLTK's English Punkt tokenizer once; splitBySentence uses it to split text into sentences.
sent_dector = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
# Debug switch: when True, the functions in this file print extra diagnostic output.
isTesting = False

## openFile
This function tries to open the file, given a file name.  It takes in two parameters:  the file name and what mode to open in, by default for this project, the mode will be set to read mode.  It will return a file. 

In [3]:
def openFile (filename, mode = "r"):
    """Open *filename* in *mode* and return the file object.

    Failures are reported on stdout instead of being raised, so callers
    must check the result.

    Args:
        filename: Path of the file to open.
        mode: Mode string passed to ``open``; defaults to read mode.

    Returns:
        The opened file object, or ``None`` if the file could not be opened.
    """
    if isTesting:
        print("openFile: ", filename,"\n")

    try:
        return open(filename, mode)
    except IOError as err:
        # Missing file, permission problem, etc.
        print("An unexpected IO error occured in openFile.", err)
    except Exception as err:
        # e.g. an invalid mode string.  ``Exception`` (not a bare except) so
        # that Ctrl-C and interpreter shutdown are not swallowed.
        print("An unexpected error occured in openFile.", err)

    return None
    # end openFile

## fileToLines
This function takes the opened file and returns the text from the file as a single string, replacing line breaks with spaces and collapsing multiple spaces into one.  The file is closed before the function returns.  

In [4]:
def fileToLines(file):
    """Return the whole content of *file* as one whitespace-normalised string.

    Args:
        file: An open, readable text file object.  It is always closed by
            this function, even when reading fails.

    Returns:
        The file's text with every run of whitespace (line breaks, tabs,
        repeated spaces) collapsed to a single space and no leading or
        trailing whitespace.
    """
    try:
        text = file.read()
    finally:
        # Close in ``finally`` so the handle is not leaked if read() raises.
        file.close()

    # split() with no argument splits on any whitespace run, newlines
    # included, so no separate newline replacement is needed.
    return " ".join(text.split())
    # end fileToLines

## splitBySentence
This function takes a string and returns a list of the sentences it contains.  Leading and trailing spaces are removed from the string before it is split. 

In [5]:
def splitBySentence(lines):
    """Split *lines* into sentences with the module-level tokenizer.

    Args:
        lines: The text to split, as a single string.

    Returns:
        Whatever ``sent_dector.tokenize`` returns for the text after its
        leading and trailing whitespace has been stripped.
    """
    if isTesting:
        print("\nin  splitBySentence\n")

    return sent_dector.tokenize(lines.strip())
    # end splitBySentence

## listToDf
This function will take in a list of tokens and put it into a pandas data frame and append the list of tokens to the original data frame. Returning the original data frame with the new text appended.

In [6]:
def listToDf(tokens, original):
    """Put *tokens* in a one-column data frame and append it to *original*.

    Args:
        tokens: Iterable of sentence strings.
        original: Data frame to extend.  It is not modified.

    Returns:
        A data frame with a ``text`` column.  If *original* is empty the
        result holds just the new tokens; otherwise it is *original*
        followed by the new rows, re-indexed from 0.
    """
    if isTesting:
        print("\nin listToDf \n")

    # Building from a dict names the column up front, so an empty token
    # list yields an empty "text" column instead of a length-mismatch error.
    df = pd.DataFrame({"text": list(tokens)})

    if original.empty:
        return df

    # concat returns a new frame; the result must be returned, otherwise the
    # appended rows are lost.
    return pd.concat([original, df], ignore_index = True)
    # end listToDf

## removeShort
This function takes a data frame and removes the rows whose text field has `min` words or fewer.  The variable min is set to 3 by default, unless the function is called with a different value specified.  It returns the filtered data frame.  This is done to help remove chapter labels.  

In [7]:
def removeShort(dataFrame, min = 3):
    """Drop rows whose ``text`` has *min* words or fewer.

    Words are counted by splitting the text on whitespace.  Used to strip
    short labels such as chapter headings.

    Args:
        dataFrame: Data frame with a ``text`` column of strings.  It is not
            modified.
        min: Largest word count that is still removed; rows need more than
            this many words to be kept.  Defaults to 3.

    Returns:
        A new data frame with the short rows removed and the index reset to
        start at 0.
    """
    if isTesting:
        print("\nin removeShort\n")
        print(dataFrame)

    count = dataFrame["text"].str.split().str.len()
    if isTesting:
        print("count is: \n", count)

    # Negating the "too short" mask (instead of testing count > min) keeps
    # rows whose count is NaN, i.e. rows with missing text.
    tooShort = count <= min
    dropped = dataFrame[~tooShort].copy().reset_index(drop = True)
    if isTesting:
        print("\nafter dropped")
        print(dropped)

    return dropped
    # end removeShort

In [8]:
# if(isTesting):
#     data = pd.DataFrame({'text': ['hello my name is','hey', 'hello world', 'help me please!']})
#     removeShort(data)

## addAuthor
This function takes a data frame and the string representation of the author (in this project MWS, EAP, or HPL).  It returns the data frame with an additional author column. 

In [9]:
def addAuthor(df, author):
    """Label every row of *df* with *author*.

    Args:
        df: Data frame to label.  It is modified in place.
        author: Value stored in the ``author`` column of every row.

    Returns:
        The same data frame object that was passed in.
    """
    if isTesting:
        print("\n \n")

    # A scalar assignment fills the whole column with the same value.
    df["author"] = author
    return df
    # end addAuthor

## getPath
This function takes a list of strings that represent the folders (and optionally a final file name) leading from the current working directory.  It returns a string representation of the path to a file or to where a file should be created. 

In [10]:
def getPath(folderSystem, path = None):
    """Join the components in *folderSystem* onto a base directory.

    Args:
        folderSystem: Iterable of path components (folder names, optionally
            ending with a file name) in order.
        path: Base path to start from.  When omitted, the current working
            directory at the time of the call is used.

    Returns:
        The combined path as a string.
    """
    if path is None:
        # Looked up on every call: a default of os.getcwd() in the signature
        # would be frozen at definition time and go stale after a chdir.
        path = os.getcwd()

    path = os.path.join(path, *folderSystem)

    if isTesting:
        print("getPath built:", path)

    return path
    # end getPath

## dfToCsv
This function takes a data frame and a string representation of the file name.  It writes the data frame to the file specified by the filename, appending to the file if it already exists.

In [11]:
def dfToCsv(df, filename):
    """Append *df* to the CSV file *filename*, creating the file if needed.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build one CSV with a single header.  The index
    is never written.

    Args:
        df: Data frame to write.
        filename: Path of the CSV file.

    Raises:
        OSError: If the file cannot be opened for appending.
    """
    if isTesting:
        print("\nin difToCsv \n")
        print("file path: ", filename)

    needsHeader = not os.path.exists(filename) or os.path.getsize(filename) == 0

    # Passing the path lets pandas open the file itself with the newline
    # handling it expects, and close it even if writing fails.
    df.to_csv(filename, mode = "a", header = needsHeader, index = False)
    # end dfToCsv

## readAndCreate
This function prompts the user through the process of reading in the .txt files of one author and saving them as a .csv file, assuming the .txt files are located in data/Books/[author initials]. 

In [12]:
def readAndCreate():
    """Convert every .txt file of one author into rows of that author's CSV.

    Prompts for the author's folder name, then for each ``*.txt`` file in
    ``data/Books/<author>`` (relative to the current working directory):
    reads it, splits it into sentences, drops short sentences, labels the
    rows with the author and appends them to ``data/author/<author>.csv``.

    Files that cannot be opened, or that contain no sentences, are skipped.
    """
    if isTesting:
        print("\nin readAndCreate\n")

    # get folder name
    print("Enter author's name (MWS, EAP, HPL)")
    author = input()

    folderPath = os.path.join(os.getcwd(), "data", "Books", author)
    pattern = os.path.join(folderPath, '*.txt')
    # Every file of this author goes to the same CSV, so build the path once.
    csvPath = getPath(["data", "author", author + ".csv"])

    if isTesting:
        print("List is:", pattern)

    for filename in glob.glob(pattern):
        currentFile = openFile(filename)
        if currentFile is None:
            # openFile already reported the problem; move on to the next file.
            continue

        lines = fileToLines(currentFile)
        tokens = splitBySentence(lines)
        if not tokens:
            # Nothing to write for an empty file.
            continue

        # Each file gets its own frame and is appended to the CSV right away.
        currentDf = listToDf(tokens, pd.DataFrame())
        currentDf = removeShort(currentDf)
        addAuthor(currentDf, author)
        dfToCsv(currentDf, csvPath)
    # end readAndCreate
    
# readAndCreate()

## merge()
This function merges .csv files into a single data frame.  The second parameter is the list of file names to merge, and the first parameter is the number of files from that list to merge.

In [13]:
def merge(number, listOfFiles):
    """Read the first *number* CSV files of *listOfFiles* into one data frame.

    Args:
        number: How many entries of *listOfFiles* to read, from the start.
        listOfFiles: Paths of the CSV files.

    Returns:
        The rows of all files read, in order, with a fresh 0-based index.
    """
    frames = [pd.read_csv(listOfFiles[position]) for position in range(number)]
    return pd.concat(frames, ignore_index = True)
    # end merge()

## getMergeInfo()
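This function prompts the user for the number of files to merge and for the location of each file, merges them, and writes the result to master.csv in the data folder.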

In [26]:
def getMergeInfo():
    """Ask which CSV files to merge, merge them and write ``data/master.csv``.

    The user enters the number of files, then whether they are all .csv
    files in ``data/author``.  If so, only each file's base name is asked
    for; otherwise a full path is asked for each file.  The merged rows are
    written with ``dfToCsv``.
    """
    print("How many files do you want to merge? ")
    fileCount = int(input())

    print("\nAre the files the .csv files in the data/author folder? (y / any other key)")
    inAuthorFolder = input() == 'y'

    chosenPaths = []
    for _ in range(fileCount):
        if not inAuthorFolder:
            print("Please enter the full path of the file you want to merge: ")
            chosenPaths.append(input())
            continue

        print("Which .csv file in data/author do you want to merge (please don't include .csv)?")
        baseName = input()
        chosenPaths.append(getPath(["data", "author", baseName + ".csv"]))

    master = merge(fileCount, chosenPaths)
    if isTesting:
        print("master is: ",master)
    print("Wrtting merged file to master.csv in the data folder")

    dfToCsv(master, getPath(["data", "master.csv"]))
    # end getMergeInfo()

In [28]:
def classifyAuthor():
    """Label the rows of ``data/test.csv`` using ``data/master.csv``.

    A test row gets an author when exactly one master row's text starts
    with the test row's text; rows with no match or several matches stay
    unlabeled.  Two files are written to the data folder with ``dfToCsv``:
    ``fullLabeledTest.csv`` (all rows) and ``labeledTest.csv`` (rows with
    any missing value dropped).
    """
    test = pd.read_csv(getPath(["data", "test.csv"]))
    master = pd.read_csv(getPath(["data", "master.csv"]))

    test["author"] = None

    for i, row in test.iterrows():
        text = row["text"]
        if not isinstance(text, str):
            # Missing text cannot be compared; leave the row unlabeled.
            continue

        # Literal prefix comparison.  The sentence must not be used as a
        # regular expression: punctuation such as "(", "?" or "." would be
        # read as regex syntax and match wrongly or raise.  na=False treats
        # master rows with missing text as non-matching.
        result = master[master["text"].str.startswith(text, na = False)]
        countOfResult = result["text"].count()

        if countOfResult == 1:
            test.at[i, "author"] = result.iloc[0]["author"]
        elif isTesting:
            if countOfResult == 0:
                print("\ntext not in master\n")
            else:
                print("\nMore than one match\n")

        if isTesting:
            # i is an index label from iterrows, so look it up by label.
            print(test.loc[i])
    # end for over test

    print("\ncounting how many rows are labeled and empty\n")

    print(test.count())
    labeledTest = test.copy()
    print(labeledTest.head(5))

    # Joining an empty last component leaves a trailing separator, so file
    # names can be appended directly.
    dataFolder = getPath(["data", ""])

    dfToCsv(labeledTest, dataFolder + "fullLabeledTest.csv")

    print("dropped n/a")
    droppedLabel = labeledTest.dropna()
    if isTesting:
        print(droppedLabel)
    # Preview the frame after dropping, to match the message above.
    print(droppedLabel.head(5))

    dfToCsv(droppedLabel, dataFolder + "labeledTest.csv")
    # end classifyAuthor

## printMenuOptions
This function will print out the menu options.

In [16]:
def printMenuOptions():
    """Print the numbered menu choices, framed by one blank line on each side."""
    options = (
        "0 - Exit program",
        "1 - Read .txt file and save to .csv for folder",
        "2 - Combine MWS.csv, EAP.csv, and HPL.csv files",
        "3 - Use master.csv to match author",
    )

    print()
    for option in options:
        print(option)
    print()
    # end printMenuOptions

## menu
This function starts the converting process.  It displays the menu and lets the user specify which files to convert into .csv files.  It also gives users the ability to combine different csv files and to label the test file from master.csv. 

In [29]:
def menu():
    """Run the interactive menu until the user chooses 0.

    Shows the options, reads a selection and runs the matching action:
    1 -> readAndCreate, 2 -> getMergeInfo, 3 -> classifyAuthor, 0 -> exit.
    Anything else, including non-numeric input, is reported as invalid and
    the menu is shown again.
    """
    choice = None

    while choice != 0:
        printMenuOptions()
        print("Make a selection:")
        rawChoice = input()
        try:
            choice = int(rawChoice)
        except ValueError:
            # Non-numeric input: report it and prompt again instead of crashing.
            choice = None
            print(rawChoice, "is not a valid choice.\n")
            continue

        if choice == 1:
            readAndCreate()
        elif choice == 2:
            getMergeInfo()
        elif choice == 3:
            classifyAuthor()
        elif choice == 0:
            print("Ending program")
        else:
            print(choice, "is not a valid choice.\n")
        # end choice
    # end menu
# Start the interactive menu when this cell is run.
menu()


0 - Exit program
1 - Read .txt file and save to .csv for folder
2 - Combine MWS.csv, EAP.csv, and HPL.csv files
3 - Use master.csv to match author

Make a selection:


 3



More than one match


More than one match


More than one match


More than one match


More than one match


More than one match


counting how many rows are labeled and empty

id        8392
text      8392
author    4213
dtype: int64
           id                                               text author
0     id02310  Still, as I urged our leaving Ireland with suc...    MWS
1     id24541  If a fire wanted fanning, it could readily be ...   None
2     id00134  And when they had broken down the frail door t...    HPL
3     id27757  While I was thinking how I should possibly man...   None
4     id04081  I am not sure to what limit his knowledge may ...    EAP
5     id27337  "The thick and peculiar mist, or smoke, which ...   None
6     id24265  That which is not matter, is not at all unless...   None
7     id25917  I sought for repose although I did not hope fo...    MWS
8     id04951  Upon the fourth day of the assassination, a pa...    EAP
9     id14549         "The tone metaphysica

 0


Ending program


In [None]:
# if(isTesting):
#     targetddFolder = os.path.join(os.getcwd(), "data", "")
#     listOfFile = [targetFolder + "MWS.csv", targetFolder + "EAP.csv", targetFolder + "HPL.csv"]
#     print(listOfFile)

In [None]:
# if(isTesting):
#     print(os.getcwd())
#     currentBook = "Berenice.txt"
#     print(os.path.join(os.getcwd(), "data", "Books", "EAP", currentBook))


Python path help: https://automatetheboringstuff.com/chapter8/
Getting the working directory path: https://stackoverflow.com/questions/3430372/how-to-get-full-path-of-current-files-directory-in-python