# Readme

date: 08-11-2022 \
written by: Wan-Ting Yeh

## purpose of the script:

1. Walk through all the txt files in one folder
2. use Spacy (NLP) to:
    - clean the data (Exclude unwanted token, eg., punctuation)
    - lemmentisation (talked, talking --> talk)
    - custominsed lemmentisation (eg., peeeeeekaboo --> peekaboo)
    - count unique word / total word
    - list part of word (noun, pronoun, adj...)
3. ouput file
    - OUTPUT_PATH_final: unique word
        - filename, unique words, total numbers of a file, total unique word counts, type-token ratio
    - OUTPUT_PATH_pos: part of speech
         - filename, original word, word lema, part of speech, explanation

### Step1: load library

In [1]:
import os
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")

### Step2: Specify constants

In [2]:
# Constants
FOLDER = "C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/"
OUTPUT_PATH_final = FOLDER + "uniques_parent_UNIQUE_pooling_woint.csv"
OUTPUT_PATH_pos = FOLDER + "uniques_parent_POS_pooling_woint.csv"
CONDITION = "woint"

# final df output file format (count: unique words and total number words)
columns_final = ['filename', 'unique_words', 'total_number_words', 'unique_word_count', 'TTR']
final_df = pd.DataFrame(columns=columns_final)

# part of speech output file format
columns_pos = ['filename, word, word_lemma, word_pos, pos_explain']
pos_df = pd.DataFrame(columns=columns_pos)


In [3]:
# list of txt files in the directory

def list_text_files(folder, extension=".txt"):
    text_file_paths = []
    for root, dir, files in os.walk(folder):
        for file in files:
            if file.endswith(extension):
                text_file_paths.append(os.path.join(root, file))
    return text_file_paths

In [4]:
# check the files

text_file_paths = list_text_files(FOLDER)
print(f"path {text_file_paths}")

path ['C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/0_NC_clean_pooling.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/1_C_clean_pooling.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/clean_all_text.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/CV-0_clean_overall_contigent.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/CV-1_clean_overall_contigent.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/CV_clean_pooling.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/DG-0_clean_overall_contigent.txt', 'C:/Users/USER/PycharmProjects/UniqueWordCalculator/parent-condition/pooling-data/wo_interjections/DG-1

### Customised the lemmentisation

In [5]:
ar = nlp.get_pipe("attribute_ruler")
#syntax: ar.add([[{"TEXT":"Bro"}], [{"TEXT": "Brah"}]],{"LEMMA":"brother"})

ar.add([[{"TEXT":"let's"}]],{"LEMMA":"let us"})
ar.add([[{"TEXT":"darle"}], [{"TEXT":"darling"}]],{"LEMMA":"darling"})
ar.add([[{"TEXT":"is"}]],{"LEMMA":"be"})
ar.add([[{"TEXT":"cant"}]],{"LEMMA":"can't"})
ar.add([[{"TEXT":"aaahhh"}], [{"TEXT": "aaahh"}], [{"TEXT": "aaah"}], [{"TEXT": "ahhhh"}], [{"TEXT": "ahhh"}], [{"TEXT": "ahh"}], [{"TEXT": "aahh"}],[{"TEXT": "aah"}], [{"TEXT": "aaaa"}], [{"TEXT": "aaa"}]], {"LEMMA":"ah"})
ar.add([[{"TEXT":"awhhhh"}], [{"TEXT": "awwww"}], [{"TEXT": "aww"}], [{"TEXT": "awww"}],[{"TEXT": "awh"}]],{"LEMMA":"aw"})
ar.add([[{"TEXT":"ay"}], [{"TEXT":"ey"}]],{"LEMMA":"aye"})
ar.add([[{"TEXT":"bahhh"}], [{"TEXT":"bahh"}], [{"TEXT":"bahhhh"}]], {"LEMMA":"bah"})
ar.add([[{"TEXT":"boooeeehh"}], [{"TEXT": "booooo"}], [{"TEXT": "boooo"}], [{"TEXT": "booo"}]],{"LEMMA":"boo"})
ar.add([[{"TEXT":"blahalalala"}],[{"TEXT": "blahhh"}]],{"LEMMA":"blabla"})
ar.add([[{"TEXT":"bluey"}]],{"LEMMA":"blue"})
ar.add([[{"TEXT":"baba"}], [{"TEXT":"bub"}]],{"LEMMA":"bubba"})
ar.add([[{"TEXT":"child name"}], [{"TEXT": "childs name"}], [{"TEXT": "childsname"}]],{"LEMMA":"childname"})
ar.add([[{"TEXT":"dada"}], [{"TEXT": "daddy"}], [{"TEXT": "dad"}]],{"LEMMA":"father"})
ar.add([[{"TEXT":"doe"}]],{"LEMMA":"do"})
ar.add([[{"TEXT":"doggy"}], [{"TEXT": "doggies"}]],{"LEMMA":"dog"})
ar.add([[{"TEXT":"duckies"}], [{"TEXT": "ducky"}]],{"LEMMA":"duck"})
ar.add([[{"TEXT":"e"}], [{"TEXT": "eee"}]],{"LEMMA":"ee"})
ar.add([[{"TEXT":"gunna"}], [{"TEXT": "gonna"}]],{"LEMMA":"go to"})
ar.add([[{"TEXT":"gonnne"}], [{"TEXT": "gooone"}], [{"TEXT": "gooo"}], [{"TEXT": "gone"}], [{"TEXT": "gon"}], [{"TEXT": "going"}]],{"LEMMA":"go"})
ar.add([[{"TEXT":"hideee"}], [{"TEXT": "hidey"}], [{"TEXT": "hiding"}]],{"LEMMA":"hide"})
ar.add([[{"TEXT":"hiiii"}], [{"TEXT": "hiii"}], [{"TEXT": "hii"}], [{"TEXT": "hai"}]],{"LEMMA":"hi"})
ar.add([[{"TEXT":"heyyy"}], [{"TEXT": "heyy"}]],{"LEMMA":"hey"})
ar.add([[{"TEXT":"hellooo"}], [{"TEXT": "helloo"}]],{"LEMMA":"hello"})
ar.add([[{"TEXT":"jooob"}]],{"LEMMA":"job"})
ar.add([[{"TEXT":"xxx"}], [{"TEXT": "xx"}]],{"LEMMA":"kiss"})
ar.add([[{"TEXT":"meee"}], [{"TEXT": "mee"}]],{"LEMMA":"me"})
ar.add([[{"TEXT":"mummyyyy"}], [{"TEXT": "mummyyy"}], [{"TEXT": "mummys"}], [{"TEXT": "ma"}],[{"TEXT": "muuummy"}], [{"TEXT": "x0019_mummy"}], [{"TEXT": "muummy"}], [{"TEXT": "mu"}],[{"TEXT": "mummy"}], [{"TEXT": "mama"}], [{"TEXT": "mama"}], [{"TEXT": "mumma"}], [{"TEXT": "muma"}], [{"TEXT": "mu-"}], [{"TEXT": "mum"}]],{"LEMMA":"mother"})
ar.add([[{"TEXT":"mmmm"}], [{"TEXT": "mmm"}], [{"TEXT":"m."}],[{"TEXT":"m"}]],{"LEMMA":"mm"})
ar.add([[{"TEXT":"naaw"}], [{"TEXT":"naaaw"}]],{"LEMMA":"naw"})
ar.add([[{"TEXT":"nooo"}], [{"TEXT": "nono"}], [{"TEXT": "noho"}]],{"LEMMA":"no"})
ar.add([[{"TEXT":"ooooooooh"}], [{"TEXT": "ooh"}], [{"TEXT": "ohhh"}], [{"TEXT": "ohh"}]],{"LEMMA":"oh"})
ar.add([[{"TEXT":"okay"}],[{"TEXT":"k"}],[{"TEXT":"ok"}]], {"LEMMA":"okay"})
ar.add([[{"TEXT":"oooo"}], [{"TEXT": "ooo"}]],{"LEMMA":"oo"})
ar.add([[{"TEXT":"oop"}], [{"TEXT": "ooph"}], [{"TEXT": "ooop"}]],{"LEMMA":"oops"})
ar.add([[{"TEXT":"oyeee"}]],{"LEMMA":"oye"})
ar.add([[{"TEXT":"peeee"}], [{"TEXT": "peeek"}], [{"TEXT": "peekk"}], [{"TEXT": "pee"}]],{"LEMMA":"peek"})
ar.add([[{"TEXT":"peeeekaaa"}], [{"TEXT": "peeekaaa"}], [{"TEXT": "peekaaaa"}],[{"TEXT": "peeka-"}], [{"TEXT": "peekaaaaa"}],[{"TEXT": "peeekaaaa"}],[{"TEXT": "peekab"}], [{"TEXT": "peekaaa"}], [{"TEXT": "peakkaaaa"}], [{"TEXT": "peekkkaa"}], [{"TEXT": "peakkaa"}], [{"TEXT": "peekkaa"}], [{"TEXT": "peekkab"}], [{"TEXT": "peepo"}], [{"TEXT": "bebo"}], [{"TEXT": "peekka"}]],{"LEMMA":"peeka"})
ar.add([[{"TEXT":"peekahboooo"}], [{"TEXT": "peekaboooo"}], [{"TEXT": "peekabooo"}], [{"TEXT": "peekkaboo"}], [{"TEXT": "peeaboo"}], [{"TEXT": "peek-a-boo"}], [{"TEXT": "peekaboo^"}],[{"TEXT": "peeakboo"}], [{"TEXT": "peekabooooo"}]],{"LEMMA":"peekaboo"})
ar.add([[{"TEXT":"piggy"}]],{"LEMMA":"pig"})
ar.add([[{"TEXT":"playng"}], [{"TEXT": "playin"}],[{"TEXT": "play-"}]],{"LEMMA":"play"})
ar.add([[{"TEXT":"reaadyyyy"}], [{"TEXT": "reaaady"}], [{"TEXT": "readyyy"}]],{"LEMMA":"ready"})
ar.add([[{"TEXT":"sheepy"}]],{"LEMMA":"sheep"})
ar.add([[{"TEXT":"thereeee"}], [{"TEXT": "thereee"}], [{"TEXT": "theres"}]],{"LEMMA":"there"})
ar.add([[{"TEXT":"tryna"}], [{"TEXT":"trynna"}]],{"LEMMA":"try"})
ar.add([[{"TEXT":"wanna"}]],{"LEMMA":"want"})
ar.add([[{"TEXT":"wewh"}]],{"LEMMA":"wew"})
ar.add([[{"TEXT":"whats"}], [{"TEXT":"wha"}]],{"LEMMA":"what"})
ar.add([[{"TEXT":"whereeeee"}], [{"TEXT": "wheeere"}], [{"TEXT": "whereee"}], [{"TEXT": "wheree"}], [{"TEXT": "wheres"}]],{"LEMMA":"where"})
ar.add([[{"TEXT":"yayyyy"}], [{"TEXT": "yayyy"}], [{"TEXT": "yayy"}], [{"TEXT": "yyyay"}]],{"LEMMA":"yay"})
ar.add([[{"TEXT":"yeahh"}], [{"TEXT": "yea"}]],{"LEMMA":"yeah"})
ar.add([[{"TEXT":"youuuu"}], [{"TEXT": "youuu"}]],{"LEMMA":"you"})
ar.add([[{"TEXT":"yoo"}]],{"LEMMA":"yo"})
ar.add([[{"TEXT":"uhh"}]],{"LEMMA":"uh"})
ar.add([[{"TEXT":"whoaa"}]],{"LEMMA":"whoa"})
ar.add([[{"TEXT":"wooh"}]],{"LEMMA":"woo"})
ar.add([[{"TEXT":"copter"}]],{"LEMMA":"helicopter"})
ar.add([[{"TEXT":"ahhwhere"}]],{"LEMMA":"anywhere"})
ar.add([[{"TEXT":"'ve"}]],{"LEMMA":" have"})
ar.add([[{"TEXT":"'s"}]],{"LEMMA":" be"})


## manually seperate incorrect spacing words
ar.add([[{"TEXT":"where’swhere"}]],{"LEMMA":"where be where"})
ar.add([[{"TEXT":"cancan"}]],{"LEMMA":"can can"})
ar.add([[{"TEXT":"childname]peekaboo"}]],{"LEMMA":"childname peekaboo"})
ar.add([[{"TEXT":"daddywhoo"}]],{"LEMMA":"dad who"})
ar.add([[{"TEXT":"doingwe're"}]],{"LEMMA":"do we be"})
ar.add([[{"TEXT":"ducksduck"}]],{"LEMMA":"duck duck"})
ar.add([[{"TEXT":"onetwothree"}]],{"LEMMA":"one two three"})
ar.add([[{"TEXT":"fourthreetwo"}]],{"LEMMA":"four three two"})
ar.add([[{"TEXT":"iwe"}]],{"LEMMA":"i we"})
ar.add([[{"TEXT":"mawhere"}]],{"LEMMA":"mum where"})
ar.add([[{"TEXT":"wannado"}]],{"LEMMA":"want to do"})
ar.add([[{"TEXT":"ta"}], [{"TEXT":"tha"}]],{"LEMMA":"that"})
ar.add([[{"TEXT":"th"}], [{"TEXT":"ti"}]],{"LEMMA":"this"})
ar.add([[{"TEXT":"youoh"}]],{"LEMMA":"you oh"})
ar.add([[{"TEXT":"where'swhere"}]],{"LEMMA":"where be where"})
ar.add([[{"TEXT":"hidecan"}]],{"LEMMA":"hide can"})
ar.add([[{"TEXT":"youyou"}]],{"LEMMA":"you you"})


## manually exclude laugh and gasp
ar.add([[{"TEXT":"laughs"}], [{"TEXT": "laugh"}], [{"TEXT": "laughing"}]],{"LEMMA":"laugh"})
ar.add([[{"TEXT":"gasps"}], [{"TEXT": "gasp"}]],{"LEMMA":"gasp"})
ar.add([[{"TEXT":"*"}], [{"TEXT": "_"}], [{"TEXT": "]"}], [{"TEXT": "["}], [{"TEXT": "_?"}], [{"TEXT": "."}], [{"TEXT": "s"}]],{"LEMMA":" "})

### Loop through file

1. tokenise the data
2. lemmentise data with customised words
3. clean the data (no punctuation, space and \n)
4. **Exclude data**
    - punctuation
    - space
    - X (other)
    - symbol
    - manual: manual_exlcude
    - two version:
        - exclude - interjection (eg., wow, whoa, yay)? [INTJ]
        - non-exclusion for interjection (modify [manual_exclude])
5. print out [word, word.lemma, word part of speech, explain part of speech]

In [6]:
# manually exclude words
manual_exclude = ["laugh", "gasp", "x0019_s", "x0014", "childname]_x0002", " ", "\x19s", "n", "?", "\x14", 
                  "\x19s-", "..", "!", "\x19d", "\x19re", "brr", "oi", "eh", "blabla", "dah", "woo", "oo",
                 "waaa", "yo", "op", "whoopsie", "psst", "weee", "wew", "co", "aw", "rawr", "whoa", "oye",
                 "hm", "dou", "aa", "hoo", "ewawa", "hmm", "buh", "tongtong", "ba", "whoo", "aye", "baa",
                 "naw", "hmph", "shh", "titiro", "hoi", "eieio", "ohp", "mm", "heh", "weh"]


for file_path in text_file_paths:
  with open(file_path, mode='r', encoding='cp1252') as text_file:
    file_name = os.path.basename(file_path)
    text = text_file.read()

    #cleaning texts
    text = text.lower()
    text = text.strip()
    text = nlp(text)
    
    
    filter_token = []          #filter_token: list of token after filtering lemma
    token_pos = []             #token: part of word
    unique = []                #unique: unique word
    lines = []                 #line: a list of filename, word, word.lemma, word.pos, explanation
    
    for word in text:
        if word.pos_ not in ["SPACE", "PUNCT", "X", "SYM", "INTJ"] and word.lemma_ not in manual_exclude:
            filter_token.append(word.lemma_)
            token_pos.append(word.pos_)
            line = [file_name, word, word.lemma_, word.pos_, spacy.explain(word.pos_)]
            lines.append(line)
            
            word_lemma = word.lemma_
            
#             print(word_lemma)
#             save clean text into a text for frequency analysis for the future
            with open(f"{FOLDER}{file_name}_clean_{CONDITION}.txt", mode="a") as clean_file:
                word_lemma = word_lemma.strip()
                clean_file.write(f"{word_lemma}\n")
            
            # unique token list
            for token in filter_token:
                if token not in unique:
                    unique.append(token)
    
  
    # numbers of words/ unique words
    total_num = len(filter_token)
    total_unique = len(unique)
    
    if total_unique == 0:
        TTR = 0
    else:
        TTR = total_unique/total_num
    
    # save output for final df
    data_row = pd.DataFrame(dict(zip(columns_final, [file_name, [unique], total_num, total_unique, TTR])))
    final_df = pd.concat([final_df, data_row], axis=0, ignore_index=True) 
    
    # save output for part of speech
    data_row_pos = pd.DataFrame(dict(zip(columns_pos, [lines])))
    pos_df  = pd.concat([pos_df, data_row_pos], axis=0, ignore_index=True) 

## save files

1. final_df = unique word file
    - filename, unique words, total numbers of a file, total unique word counts
2. pos_df = part of speech file
    - ['filename', 'original_word', 'word_lemma', 'word_pos']

In [16]:
print(unique)

['it', 'be', 'go', 'play', 'peekaboo', 'alright', 'this', 'we', 'to', 'where', 'mother', 'boo', 'you', 'want', 'look', 'in', 'behind', 'here', 'well', 'he', 'childname', 'there', 'she', 'I', 'have', 'get', 'the', 'duck', 'sorry', 'darling', 'change', 'do', 'with', 'my', 'face', 'can', 'see', 'animal', 'no', 'but', 'what', 'just', 'and', 'like', 'a', 'magician', 'make', 'they', 'disappear', 'know', 'say', 'remember', 'how', 'people', 'ready', 'cover', 'count', 'again', 'your', 'eye', 'one', 'two', 'now', 'try', 'that', 'okay', 'time', 'three', 'peeka', 'hide', 'hang', 'on', 'drop', 'helicopter', 'at', 'turn', 'should', 'new', 'yourself', 'thing', 'let', 'around', 'gosh', 'leave', 'screen', 'so', 'many', 'interesting', 'for', 'nose', 'would', 'curtain', 'goat', 'about', 'nah', 'cow', 'farm', 'later', 'back', 'not', 'out', 'come', 'nothing', 'yay', 'sit', 'baby', 'hand', 'bubba', 'will', 'bah', 'pardon', 'bag', 'yep', 'alone', 'could', 'must', 'her', 'hello', 'sweetheart', 'show', 'right'

In [7]:
print(total_num)
final_df.to_csv(OUTPUT_PATH_final)
final_df.head()

3065


Unnamed: 0,filename,unique_words,total_number_words,unique_word_count,TTR
0,0_NC_clean_pooling.txt,"[it, be, go, play, peekaboo, alright, this, we...",4541,233,0.05131
1,1_C_clean_pooling.txt,"[where, be, mother, I, have, get, the, duck, s...",2245,197,0.087751
2,clean_all_text.txt,"[it, be, go, play, peekaboo, alright, this, we...",6816,291,0.042694
3,CV-0_clean_overall_contigent.txt,"[it, be, go, play, peekaboo, alright, this, we...",2390,196,0.082008
4,CV-1_clean_overall_contigent.txt,"[where, be, mother, I, have, get, the, duck, s...",1335,158,0.118352


In [8]:
# print(token_pos)
pos_df.to_csv(OUTPUT_PATH_pos)
pos_df.head()

Unnamed: 0,"filename, word, word_lemma, word_pos, pos_explain"
0,"[0_NC_clean_pooling.txt, it, it, PRON, pronoun]"
1,"[0_NC_clean_pooling.txt, be, be, AUX, auxiliary]"
2,"[0_NC_clean_pooling.txt, go, go, VERB, verb]"
3,"[0_NC_clean_pooling.txt, play, play, VERB, verb]"
4,"[0_NC_clean_pooling.txt, peekaboo, peekaboo, N..."
