# Readme

date: 08-11-2022 \
written by: Wan-Ting Yeh

## purpose of the script:

1. Walk through all the txt files in one folder
2. use Spacy (NLP) to:
    - tokenise the text 
    - clean the data (Exclude unwanted token, eg., punctuation)
    - lemmentisation (talked, talking --> talk)
    - custominsed lemmentisation (eg., peeeeeekaboo --> peekaboo)
    - count unique word / total word / type-token ratio
    - list part of word (noun, pronoun, adj...)
3. ouput file
    - OUTPUT_PATH_final: unique word
        - filename, unique words, total numbers of a file, total unique word counts, type-token ratio
    - OUTPUT_PATH_pos: part of speech
         - filename, original word, word lema, part of speech, explanation

### Step1: load library

In [29]:
import os
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

### Step2: Specify constants

In [30]:
# Constants
FOLDER = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/text_agent_condition/parent/test/"
OUTPUT_PATH_final = FOLDER + "/uniques_parent_spacy_unique_test.csv"
OUTPUT_PATH_pos = FOLDER + "/uniques_parent_spacy_pos.csv"

# final df output file format (count: unique words and total number words)
columns_final = ['filename', 'unique_words', 'total_number_words', 'unique_word_count', 'TTR']
final_df = pd.DataFrame(columns=columns_final)

# part of speech output file format
columns_pos = ['filename, word, word_lemma, word_pos, pos_explain']
pos_df = pd.DataFrame(columns=columns_pos)


In [31]:
# list of txt files in the directory

def list_text_files(folder, extension=".txt"):
    text_file_paths = []
    for root, dir, files in os.walk(folder):
        for file in files:
            if file.endswith(extension):
                text_file_paths.append(os.path.join(root, file))
    return text_file_paths

In [32]:
# check the files

text_file_paths = list_text_files(FOLDER)
print(f"path {text_file_paths}")

path ['C:/Users/USER/PycharmProjects/UniqueWordCalculator/text_agent_condition/parent/test/DIME002-CV.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/text_agent_condition/parent/test/DIME002-DG.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/text_agent_condition/parent/test/DIME004-CV.txt']


### Customised the lemmentisation

In [33]:
ar = nlp.get_pipe("attribute_ruler")
#syntax: ar.add([[{"TEXT":"Bro"}], [{"TEXT": "Brah"}]],{"LEMMA":"brother"})

ar.add([[{"TEXT": "cant"}]],{"LEMMA":"can't"})
ar.add([[{"TEXT":"aaahhh"}], [{"TEXT": "aaahh"}], [{"TEXT": "aaah"}], [{"TEXT": "ahhhh"}], [{"TEXT": "ahhh"}], [{"TEXT": "ahh"}], [{"TEXT": "aah"}], [{"TEXT": "aaaa"}], [{"TEXT": "aaa"}]], {"LEMMA":"ah"})
ar.add([[{"TEXT":"awhhhh"}], [{"TEXT": "awwww"}], [{"TEXT": "aww"}], [{"TEXT": "awh"}]],{"LEMMA":"aw"})
ar.add([[{"TEXT":"bahhh"}], [{"TEXT":"bahh"}]], {"LEMMA":"bah"})
ar.add([[{"TEXT":"boooeeehh"}], [{"TEXT": "booooo"}], [{"TEXT": "boooo"}], [{"TEXT": "booo"}]],{"LEMMA":"boo"})
ar.add([[{"TEXT":"blahalalala"}]],{"LEMMA":"blabla"})
ar.add([[{"TEXT":"bluey"}]],{"LEMMA":"blue"})
ar.add([[{"TEXT":"baba"}]],{"LEMMA":"bubba"})

### Loop through file

1. tokenise the data
2. lemmentise data with customised words
3. clean the data (no punctuation, space and \n)
4. **Exclude data**
    - interjection (eg., wow, whoa, yay)
    - punctuation
    - space
    - X (other)
    - symbol
    - manual: manual_exlcude
5. print out [word, word.lemma, word part of speech, explain part of speech]

In [34]:
# manually exclude words
manual_exclude = ["laugh", "gasp"," "]



for file_path in text_file_paths:
  with open(file_path, mode='r', encoding='cp1252') as text_file:
    file_name = os.path.basename(file_path)
    text = text_file.read()

    #cleaning texts
    text = text.lower()
    text = nlp(text)
    
    filter_token = []
    token_pos = []
    unique = []
    lines = []
    
    for word in text:
        if word.pos_ not in ["INTJ", "SPACE", "PUNCT", "X", "SYM"] and word.lemma_ not in manual_exclude:
            filter_token.append(word.lemma_)
            token_pos.append(word.pos_)
            line = [file_name, word, word.lemma_, word.pos_, spacy.explain(word.pos_)]
            lines.append(line)
            for token in filter_token:
                if token not in unique:
                    unique.append(token)
                    
        # numbers of words/ unique words
    total_num = len(text)
    total_unique = len(unique)
    TTR = total_unique/total_num
    
    # save output for final df
    data_row = pd.DataFrame(dict(zip(columns_final, [file_name, [unique], total_num, total_unique, TTR])))
    final_df = pd.concat([final_df, data_row], axis=0, ignore_index=True) 
    
    # save output for part of speech
    data_row_pos = pd.DataFrame(dict(zip(columns_pos, [lines])))
    pos_df  = pd.concat([pos_df, data_row_pos], axis=0, ignore_index=True) 

## save files

1. final_df = unique word file
    - filename, unique words, total numbers of a file, total unique word counts
2. pos_df = part of speech file
    - ['filename', 'original_word', 'word_lemma', 'word_pos']

In [36]:
print(total_num)
final_df.to_csv(OUTPUT_PATH_final)
final_df.head()

110


Unnamed: 0,filename,unique_words,total_number_words,unique_word_count,TTR
0,DIME002-CV.txt,"[it, is, go, play, peekaboo, this, be, we, to...",157,29,0.184713
1,DIME002-DG.txt,"[it, is, change, do, you, want, to, play, pee...",150,34,0.226667
2,DIME004-CV.txt,"[you, be, like, a, magician, make, they, disap...",110,37,0.336364


In [37]:
# print(token_pos)
pos_df.to_csv(OUTPUT_PATH_pos)
pos_df.head()

Unnamed: 0,filename| word| word_lemma| word_pos| pos_explain
0,"[DIME002-CV.txt, it, it, PRON, pronoun]"
1,"[DIME002-CV.txt, 's, is, AUX, auxiliary]"
2,"[DIME002-CV.txt, gone, go, VERB, verb]"
3,"[DIME002-CV.txt, play, play, VERB, verb]"
4,"[DIME002-CV.txt, peekaboo, peekaboo, NOUN, noun]"
