# Converting texts into CSV files.
This file is responsible for taking .txt files of the story and splitting them into seperate sentences.  To remove labels, manually, the header of the file will be removed.  For each sentence found, sentences with 2 or fewer words will be reomved.  Assuming it will lines like 'Chapter II'.  And removing all whitespace that is not a space. 

In [1]:
import nltk.data
import pandas as pd
import os
import glob

sent_dector = nltk.data.load('tokenizers/punkt/english.pickle')

In [2]:
isTesting = True

## openFile
This function tries to open the file, given a file name.  It takes in two parameters:  the file name and what mode to open in, by default for this project, the mode will be set to read mode.  It will return a file. 

In [3]:
def openFile (filename, mode = "r"):
    if(isTesting):
        print("openFile: ", filename,"\n")
        
    try:
        file = open(filename, mode)
        return file
                                        
    except IOError:
        print("An unexpected IO error occured in openFile.")
    except:
        print("An unexpected error occured in openFile.")
    
    #end textToVariable

## fileToLines
This function will take in the opened file and return a the text from the file as a single string. Replacing line breaks with spaces, and removing multiple spaces.  

In [4]:
def fileToLines(file):
    lines = file.read().replace("\n", " ")
    " ".join(lines.split())
    file.close()
    return lines
    #end fileToLines

In [5]:
# # reference!
# def openFile (filename, mode = "r"):
#     if(isTesting):
#         print("openFile: ", filename,"\n")
#     lines = None
    
#     try:
#         with open(filename, mode) as file:
#             lines = file.read().replace('\n', ' ')
                                        
#     except IOError:
#         print("An unexpected IO error occured in openFile.")
#     except:
#         print("An unexpected error occured in openFile.")
    
#     return lines
#     #end textToVariable

## splitBySentence
This function will take a string and return a list of the string split into sentence. Removing the leading and trailing spaces. 

In [6]:
def splitBySentence(lines):
    if(isTesting):
        print("\nin  splitBySentence\n")
    tokens = sent_dector.tokenize(lines.strip())
    
    return tokens
    #end splitBySentence

## listToDf
This function will take in a list of tokens and put it into a pandas data frame and append the list of tokens to the original data frame. Returning the original data frame with the new text appended.

In [7]:
def listToDf(tokens, original):
    if(isTesting):
        print("\nin listToDf \n")
    df = pd.DataFrame(tokens)
    df.columns = ["text"]
    
    if(original.empty):
        original = df
    else:
        original.append(df, ignore_index = True)
        
        
    return original
    #listToDf

## removeShort
This function will take in a data frame, and will remove sentences with the minimum number of words in the text field.  The variable min is set to 3 by default, unless the function is called with a different value specified. It will return the data frame.  This is done to help remove chapter labels.  

In [8]:
def removeShort(dataFrame, min = 3):
    
    if(isTesting):
        print("\nin removeShort\n")
        print(dataFrame)
    
    count = dataFrame["text"].str.split().str.len()
    ~(count == min)
    if(isTesting):
        print("count is: \n", count)
    
    dropped = dataFrame[~(count <= min)].copy()
    dropped = dropped.reset_index(drop = True)
    if(isTesting):
        print("\nafter dropped")
        print(dropped)
        
    return dropped
    # end removeShort

In [9]:
# if(isTesting):
#     data = pd.DataFrame({'text': ['hello my name is','hey', 'hello world', 'help me please!']})
#     removeShort(data)

## addAuthor
This function will take in a data frame and the string representation of the author, in this project MWS, EAP, HPL.  It will return a dataframe with an additional autorh 

In [10]:
def addAuthor(df, author):
    if(isTesting):
        print("\n \n")
    df["author"] = author
    return df
    # end addAuthor

## dfToCsv
This function will take in a data frame and a string representation of the file name.  It will write the data frame to file specified by the filename. .

In [18]:
def dfToCsv(df, filename):
    if(isTesting):
        print("\nin difToCsv \n")
    file = None
    targetFile = os.path.join(os.getcwd(), "data", filename)
    
    if(isTesting):
        print("targetFile: ", targetFile)
    
    if(os.path.exists(targetFile)):
        file = openFile(targetFile, "a")
        df.to_csv(file, header = False)
        file.close()
    else:
        file = openFile(targetFile, "a+")
        df.to_csv(file)
        file.close()
    # end dfToCsv

## readAndCreate
This function will prompt the use through the process of reading in a .txt file and saving it as a .csv file. Assuming the .txt files are located in data/Books/[author initial] 

In [12]:
def readAndCreate():
    if(isTesting):
        print("\nin readAndCreate\n")
        
    currentDf = pd.DataFrame()
    
    # get folder name
    print("Enter author's name (MWS, EAP, HPL)")
    author = input()
    
    folderPath = os.path.join(os.getcwd(), "data", "Books", author)
    
    #setup loop to go through files
    #for filename in os.listdir(folderPath):
    for filename in glob.glob(os.path.join(folderPath, '*.txt')):
        #open file
        currentFile = openFile(filename)
        #create string of text
        lines = fileToLines(currentFile)
        #tokenize string
        tokens = splitBySentence(lines)
        #list -> df
        currentDf = listToDf(tokens, currentDf)
        #end filename loop
    
    #done with all files
    #delete short sentences
    currentDf = removeShort(currentDf)
    #label author
    addAuthor(currentDf, author)
    #write to csv
    csvName = author + ".csv"
    dfToCsv(currentDf, csvName)
    
    

## printMenuOptions
This function will print out the menu options.

In [13]:
def printMenuOptions():
    print("0 - Exit program")
    print("1 - Read .txt file and save to .csv for folder")
    # print("2 - Read file - specify folder path")
    print("3 - Combine .csv files")
    # end printMenuOptions

## menu
This function starts the converting process.  It will display menus, and allow the user to specify which files to convert into .csv files.  It also gives users the ability to combine different csv fies 

In [19]:
def menu():
    choice = 9
    
    while choice != 0:
        print("Make a selection:")
        choice = input()
        choice = int(choice)
        if choice == 1:
            readAndCreate()
            # end choice =1
        elif choice == 2:
            print("In 2")
            # end choice == 2
        elif choice == 3:
            print("In 3")
            # end choice == 3
        elif choice == 0:
            print("Ending program")
            # end choice == 0
        else:
            print(choice, "is not a valid choice.\n")
        # end choice
    # end menu
menu()

Make a selection:


 1



in readAndCreate

Enter author's name (MWS, EAP, HPL)


 MWS


openFile:  /Users/veronica/Documents/SearchEngines/SpookyAuthor/SA/CSC849_SpookyAuthor/data/Books/MWS/Frankenstein.txt 


in  splitBySentence


in listToDf 


in removeShort

                                                   text
0     Letter 1  _To Mrs. Saville, England._   St. Pe...
1     You will rejoice to hear that no disaster has ...
2     I arrived here yesterday, and my first task is...
3     I am already far north of London, and as I wal...
4                       Do you understand this feeling?
5     This breeze, which has travelled from the regi...
6     Inspirited by this wind of promise, my daydrea...
7     I try in vain to be persuaded that the pole is...
8     There, Margaret, the sun is for ever visible, ...
9     There—for with your leave, my sister, I will p...
10    Its productions and features may be without ex...
11    What may not be expected in a country of etern...
12    I may there discover the wondrous power which ...
13    I shall satiate my ardent curiosity

 0


Ending program


In [15]:

print(os.getcwd())
currentBook = "Berenice.txt"
print(os.path.join(os.getcwd(), "data", "Books", "EAP", currentBook))


/Users/veronica/Documents/SearchEngines/SpookyAuthor/SA/CSC849_SpookyAuthor
/Users/veronica/Documents/SearchEngines/SpookyAuthor/SA/CSC849_SpookyAuthor/data/Books/EAP/Berenice.txt


Python path help: https://automatetheboringstuff.com/chapter8/
Getting the working directory path: https://stackoverflow.com/questions/3430372/how-to-get-full-path-of-current-files-directory-in-python