# NLTK Test on Final Fantasy Data

This is a short test of importing FF data into a Jupyter notebook created in VS Code.

## Set up dataframe

In [22]:
import pandas as pd

Create the initial DataFrame from the script CSV file

In [23]:
# Load the script CSV with pandas' default integer row index, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='script_data/world-of-ff-script.csv')
df.head(5)

Unnamed: 0,Original,Character,Dialogue,Wordcount
0,???:Prologue - Awake at Last,? ? ?,Prologue - Awake at Last,5
1,"???:Tick, tock, tick, tock...",? ? ?,"Tick, tock, tick, tock...",4
2,???:Time to wake up.,? ? ?,Time to wake up.,4
3,???:*Yawn* What a nap...,? ? ?,*Yawn* What a nap...,4
4,"???:Oh, right. I gotta get to work.",? ? ?,"Oh, right. I gotta get to work.",7


Re-read the data, this time using the `Character` column as the index

In [24]:
# Read the same CSV again, promoting the 'Character' column to the row index
# so that each row is labelled by its speaker.
fixed_df = pd.read_csv(
    'script_data/world-of-ff-script.csv',
    index_col='Character',
)
fixed_df.head(5)

Unnamed: 0_level_0,Original,Dialogue,Wordcount
Character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
? ? ?,???:Prologue - Awake at Last,Prologue - Awake at Last,5
? ? ?,"???:Tick, tock, tick, tock...","Tick, tock, tick, tock...",4
? ? ?,???:Time to wake up.,Time to wake up.,4
? ? ?,???:*Yawn* What a nap...,*Yawn* What a nap...,4
? ? ?,"???:Oh, right. I gotta get to work.","Oh, right. I gotta get to work.",7


Show the `Dialogue` column on its own, labelled by `Character` (character names repeat, so these labels are not unique keys)

In [25]:
# Select every row of the 'Dialogue' column; the result is a Series labelled by Character.
fixed_df.loc[:, 'Dialogue']

Character
? ? ?                                      Prologue - Awake at Last
? ? ?                                     Tick, tock, tick, tock...
? ? ?                                              Time to wake up.
? ? ?                                          *Yawn* What a nap...
? ? ?                               Oh, right. I gotta get to work.
                                        ...                        
Female Lilikin    Now there's nothing stopping me from enjoying ...
Soldier           Ugh... What a pain... Huh? Say, you wouldn't h...
Soldier                                        Good luck out there!
Soldier           Hey, you're back! Thanks for doing my job for ...
Soldier                         Ehh... Man, I don't want to work...
Name: Dialogue, Length: 2017, dtype: object

## Begin NLTK tokenization

Note that you may need to download the NLTK data packages (tokenizer models and corpora) the first time you run this, as prompted

In [26]:
import nltk
# The following may not be needed, depending on NLTK version
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [27]:
# Sample text packed with inflected forms (plurals, verb forms, derived nouns)
# so the effect of lemmatization is easy to see in the next cell.
example_text = "This is going to does an exciting blades action intelligently scrappy writing writers writer write performer perform performers performance."
# To try a line from the script instead, pick it by position. fixed_df is
# indexed by character name, so use .iloc rather than a bare integer in []:
# example_text = fixed_df['Dialogue'].iloc[43]

# Tokenize once and reuse the result for both the variable and the display.
sentences = sent_tokenize(example_text)

print(sentences)

['This is going to does an exciting blades action intelligently scrappy writing writers writer write performer perform performers performance.']


In [28]:
# Build the lemmatizer once; it is reused for every token below.
lemmatizer = WordNetLemmatizer()

# For each sentence: split it into word tokens, lemmatize every token, and
# join the results back together with single spaces.
# NOTE: lemmatize() is called without a part-of-speech argument, so tokens are
# lemmatized as nouns by default -- which is why "does" comes out as "doe"
# while verb forms such as "writing" are left unchanged.
new__lemmatized_sentence = []
for sentence in sentences:
    lemmas = map(lemmatizer.lemmatize, word_tokenize(sentence))
    new__lemmatized_sentence.append(' '.join(lemmas))
print(new__lemmatized_sentence)

['This is going to doe an exciting blade action intelligently scrappy writing writer writer write performer perform performer performance .']


In [34]:
# Change the position passed to .iloc below to work with a different line of dialogue.
# fixed_df is indexed by Character (string labels), so the row is selected by
# position with .iloc; a bare integer inside [] on this Series relies on a
# positional fallback that recent pandas versions deprecate.
ff_sent = fixed_df['Dialogue'].iloc[6]

# Split the chosen dialogue line into sentences once and reuse the result.
ff_sentences = sent_tokenize(ff_sent)

print(ff_sentences)

# Lemmatize each sentence token by token and re-join the tokens with spaces.
# No part-of-speech is passed to lemmatize(), so tokens are treated as nouns
# by default (hence "has" -> "ha" in the output).
lemmatized_ff_data = []
for sentence in ff_sentences:
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(sentence)]
    lemmatized_ff_data.append(' '.join(words))
print(lemmatized_ff_data)

["Then again, it's not like business has picked up any."]
["Then again , it 's not like business ha picked up any ."]
