In [19]:
import string
import pandas as pd
from os import listdir

# List the script files; each is expected to be named "<genre>_<movie-name>.txt".
# A bare "scripts" path resolves on every OS (the original trailing "\\" only
# worked on Windows), and filtering on the extension keeps stray directory
# entries out of the later open() calls and filename parsing.
files = [f for f in listdir("scripts") if f.endswith(".txt")]

In [None]:
# One-off setup: install the NLTK package (needs network access).
! pip install nltk

In [17]:
import nltk
# Fetch only the resources this notebook uses. A bare nltk.download() opens
# the interactive downloader, which stalls "Restart & Run All".
#   punkt                      -> nltk.sent_tokenize / nltk.word_tokenize
#   averaged_perceptron_tagger -> nltk.pos_tag
#   wordnet                    -> WordNetLemmatizer
#   stopwords                  -> stopwords.words('english')
# NOTE(review): recent NLTK releases may expect differently named tokenizer /
# tagger packages (e.g. 'punkt_tab') - confirm against the installed version.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Map each filename to the full text of that script.
scripts_dict = {}
for fl in files:
    # A forward slash is accepted by open() on Windows as well as POSIX,
    # whereas the original 'scripts\\' prefix only resolved on Windows.
    file_path = 'scripts/' + fl
    with open(file_path, encoding="utf8") as f:
        scripts_dict[fl] = f.read()


In [4]:
# Turn the {filename: script text} mapping into a two-column frame,
# one row per file.
scripts_df = pd.DataFrame(
    data=scripts_dict.items(),
    columns=['genre_movie_name', 'script'],
)

In [5]:
# Split "<genre>_<movie-name>.txt" into its movie-name and genre parts.
# Only a trailing ".txt" is stripped and the name is split on the first
# underscore only, so a movie name that itself contains "_" or ".txt" is
# kept intact instead of being truncated.
stems = scripts_df['genre_movie_name'].apply(
    lambda name: name[:-len('.txt')] if name.endswith('.txt') else name
)
scripts_df['movie_name'] = stems.apply(lambda stem: stem.split('_', 1)[1])
scripts_df['genre'] = stems.apply(lambda stem: stem.split('_', 1)[0])

In [6]:
# Collect every genre a movie is filed under into one list per movie.
genre_df = (
    scripts_df[['movie_name', 'genre']]
    .groupby('movie_name')
    .agg(list)
    .reset_index()
)
genre_df

Unnamed: 0,movie_name,genre
0,10-things-i-hate-about-you,"[comedy, romance]"
1,12,[comedy]
2,12-and-holding,[drama]
3,12-monkeys,"[drama, science-fiction, thriller]"
4,127-hours,"[adventure, drama, thriller]"
...,...,...
949,yes-man,"[comedy, romance]"
950,you-can-count-on-me,[drama]
951,youth-in-revolt,"[comedy, drama, romance]"
952,youve-got-mail,"[comedy, romance]"


In [7]:
# A movie filed under several genres has one file per genre; keep a single
# row per distinct (script, movie_name) pair and drop the per-file columns.
scripts_df = scripts_df.loc[:, ['script', 'movie_name']].drop_duplicates()

In [8]:
# Attach each movie's genre list; the left join keeps every script row.
scripts_df = scripts_df.merge(genre_df, how='left', on='movie_name')

In [9]:
# (rows, columns) of the merged frame.
scripts_df.shape

(954, 3)

In [10]:
# Preview the merged frame: script text, movie name and genre list.
scripts_df

Unnamed: 0,script,movie_name,genre
0,\n\n\n\ton the words CZECH AIRLINE. We are pa...,15-minutes,"[action, crime, thriller]"
1,\n\n ...,2012,"[action, adventure, drama, science-fiction, th..."
2,\n\n\n\n\n\n\n ...,30-minutes-or-less,"[action, adventure, comedy]"
3,\n\n\n\n\n\n Endless green hills ...,48-hrs,"[action, comedy, thriller]"
4,A PERFECT WORLD\n\n Written by\n\n\n\n\n...,a-perfect-world,"[action, crime, drama]"
...,...,...,...
949,\n\n\n\n Written ...,shifty,[thriller]
950,\n\n\n\n\n\n\nThe screen is black. Thunder rum...,spare-me,[thriller]
951,\n\n\n\n\n\n\n ...,the-assignment,[thriller]
952,\n\n\n\n\n\n\n ...,roughshod,[western]


In [11]:
def get_tagged_sentences(txt):
    """Split text into sentences and POS-tag the tokens of each sentence.

    Parameters
    ----------
    txt : str
        Raw text to analyse.

    Returns
    -------
    list of list of (str, str)
        One inner list per sentence, holding the (token, tag) pairs
        produced by NLTK's tagger.
    """
    tokenized = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(txt)]
    # Tag every sentence in one batch call instead of calling nltk.pos_tag
    # once per sentence; the per-sentence results are the same.
    return nltk.pos_tag_sents(tokenized)

In [12]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords 

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags beginning with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields ''.
    """
    # Name of the wordnet attribute to read, keyed by the tag's first letter.
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(treebank_tag[:1])
    if attr_name is None:
        return ''
    return getattr(wordnet, attr_name)

def get_lemmatized_words(tagged_sentences):
    """Lemmatize every token that is neither punctuation nor a stopword.

    Parameters
    ----------
    tagged_sentences : list of list of (str, str)
        Sentences made of (token, tag) pairs, e.g. the output of
        get_tagged_sentences.

    Returns
    -------
    list of str
        Lemmas in document order, duplicates kept.
    """
    lemmatizer = WordNetLemmatizer()
    # A set gives constant-time membership tests; the list was previously
    # scanned once for every token of every script.
    stop_words = set(stopwords.words('english'))
    # NOTE(review): both filters compare the token exactly as tokenized, and
    # the punctuation test is a substring check against string.punctuation.
    # Differently-cased stopwords and multi-character punctuation tokens may
    # therefore pass through - confirm whether that is intended.
    lemmas = []
    for sent in tagged_sentences:
        for word, pos in sent:
            if word in string.punctuation or word in stop_words:
                continue
            wnet_pos = get_wordnet_pos(pos)
            if wnet_pos != '':
                lemmas.append(lemmatizer.lemmatize(word, wnet_pos))
            else:
                # No WordNet class for this tag: use the lemmatizer's default.
                lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [13]:
def get_pos_simple(treebank_tag):
    """Collapse a POS tag into a coarse class name.

    Returns 'proper_noun' for tags starting with 'NNP', otherwise 'adj',
    'verb', 'noun' or 'adv' for tags starting with J, V, N or R, and ''
    for everything else.
    """
    # Proper nouns are singled out before the generic first-letter lookup.
    if treebank_tag.startswith('NNP'):
        return 'proper_noun'
    coarse_by_initial = {'J': 'adj', 'V': 'verb', 'N': 'noun', 'R': 'adv'}
    return coarse_by_initial.get(treebank_tag[:1], '')

def get_pos_counts(tagged_sentences):
    """Count tokens per coarse part-of-speech class.

    Parameters
    ----------
    tagged_sentences : list of list of (str, str)
        Sentences made of (token, tag) pairs.

    Returns
    -------
    tuple
        (number of distinct proper nouns, total proper-noun tokens,
         {pos: token count}, {pos: distinct token count}).
        Proper nouns are counted under 'noun'; tokens whose tag has no
        coarse class are counted under the key ''. A class that never
        occurs has no key at all.
    """
    tokens_by_pos = {}
    proper_nouns = []
    for sentence in tagged_sentences:
        for token, tag in sentence:
            coarse = get_pos_simple(tag)
            if coarse == 'proper_noun':
                proper_nouns.append(token)
                coarse = 'noun'
            tokens_by_pos.setdefault(coarse, []).append(token)

    totals = {pos: len(tokens) for pos, tokens in tokens_by_pos.items()}
    # Distinct counts come from de-duplicating each class's token list.
    distinct = {pos: len(set(tokens)) for pos, tokens in tokens_by_pos.items()}
    return len(set(proper_nouns)), len(proper_nouns), totals, distinct

In [14]:
def extract_text_features(row):
    """Compute part-of-speech features for one script.

    Parameters
    ----------
    row : mapping or str
        A DataFrame row (anything indexable by 'script') or the raw script
        text itself. Both are accepted because a later cell re-defines this
        name to take raw text; accepting either keeps callers working
        whichever definition is live in the kernel.

    Returns
    -------
    list
        [prop_count, verb_percent, noun_percent, adj_percent, adv_percent,
         prop_nunique, verb_unique_percent, noun_unique_percent,
         adj_unique_percent, adv_unique_percent]
    """
    txt = row if isinstance(row, str) else row['script']
    txt = txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
    sent = get_tagged_sentences(txt)
    words = get_lemmatized_words(sent)
    word_count = len(words)
    words_nunique = len(set(words))
    prop_nunique, prop_count, pos_counts_dict, pos_nunique_dict = get_pos_counts(sent)

    def percent(counts, pos, total):
        # An empty script gives total == 0, and a POS class that never
        # occurs has no key in counts; both cases report 0.0 instead of
        # raising ZeroDivisionError / KeyError.
        if total == 0:
            return 0.0
        return counts.get(pos, 0) * 100.0 / total

    # NOTE(review): numerators count every tagged token (stopwords included)
    # while denominators count lemmas left after stopword/punctuation
    # removal, so these "percentages" need not add up to <= 100 - confirm
    # that this is the intended definition.
    pos_order = ('verb', 'noun', 'adj', 'adv')
    total_percents = [percent(pos_counts_dict, pos, word_count) for pos in pos_order]
    unique_percents = [percent(pos_nunique_dict, pos, words_nunique) for pos in pos_order]

    return [prop_count] + total_percents + [prop_nunique] + unique_percents



In [15]:
# Re-display the frame that feature extraction will run over.
scripts_df

Unnamed: 0,script,movie_name,genre
0,\n\n\n\ton the words CZECH AIRLINE. We are pa...,15-minutes,"[action, crime, thriller]"
1,\n\n ...,2012,"[action, adventure, drama, science-fiction, th..."
2,\n\n\n\n\n\n\n ...,30-minutes-or-less,"[action, adventure, comedy]"
3,\n\n\n\n\n\n Endless green hills ...,48-hrs,"[action, comedy, thriller]"
4,A PERFECT WORLD\n\n Written by\n\n\n\n\n...,a-perfect-world,"[action, crime, drama]"
...,...,...,...
949,\n\n\n\n Written ...,shifty,[thriller]
950,\n\n\n\n\n\n\nThe screen is black. Thunder rum...,spare-me,[thriller]
951,\n\n\n\n\n\n\n ...,the-assignment,[thriller]
952,\n\n\n\n\n\n\n ...,roughshod,[western]


In [20]:
# Compute the feature list for every script, one row at a time.
scripts_df['features'] = scripts_df.apply(extract_text_features, axis=1)

In [None]:
# Inspect the frame, now including the 'features' column.
scripts_df

In [22]:
# Persist the results to CSV in the working directory.
# NOTE(review): the 'genre' and 'features' cells hold Python lists, which
# presumably end up as their string form in the CSV - confirm that readers
# of this file parse them back.
scripts_df.to_csv('script_text_features.csv')

## ================================================

In [23]:
# Load one script for a step-by-step walk through the pipeline.
# The encoding was misspelled "utf=8", which was presumably only accepted
# thanks to lenient codec-name normalisation; it is now spelled "utf8" like
# the bulk loader. The forward-slash path works on Windows and POSIX alike.
with open('scripts/action_15-minutes.txt', encoding="utf8") as f:
    txt = f.read()

In [24]:
# Flatten newlines, tabs and carriage returns to spaces, then tag the text.
txt = txt.translate(str.maketrans({'\n': ' ', '\t': ' ', '\r': ' '}))
sent = get_tagged_sentences(txt)
print(sent)



In [25]:
# Lemmatize the tagged sample script and show the resulting word list.
words = get_lemmatized_words(sent)
print(words)



In [26]:
# Printed tuple: (distinct proper nouns, total proper nouns,
# token count per POS class, distinct token count per POS class).
print(get_pos_counts(sent))

(594, 2404, {'': 14986, 'noun': 6707, 'verb': 4904, 'adv': 1584, 'adj': 956}, {'': 283, 'noun': 2104, 'verb': 1185, 'adv': 187, 'adj': 477})


In [27]:
# define a single function that runs the whole feature-extraction pipeline on raw text

def extract_text_features(txt):
    """Compute part-of-speech features for one script.

    Parameters
    ----------
    txt : str or mapping
        The raw script text, or a DataFrame row (anything indexable by
        'script'). Rows are accepted because an earlier cell defines this
        same name for row input and applies it over the DataFrame; taking
        either keeps that call working once this definition is live.

    Returns
    -------
    list
        [prop_count, verb_percent, noun_percent, adj_percent, adv_percent,
         prop_nunique, verb_unique_percent, noun_unique_percent,
         adj_unique_percent, adv_unique_percent]
    """
    if not isinstance(txt, str):
        txt = txt['script']
    txt = txt.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
    sent = get_tagged_sentences(txt)
    words = get_lemmatized_words(sent)
    word_count = len(words)
    words_nunique = len(set(words))
    prop_nunique, prop_count, pos_counts_dict, pos_nunique_dict = get_pos_counts(sent)

    def percent(counts, pos, total):
        # An empty script gives total == 0, and a POS class that never
        # occurs has no key in counts; both cases report 0.0 instead of
        # raising ZeroDivisionError / KeyError.
        if total == 0:
            return 0.0
        return counts.get(pos, 0) * 100.0 / total

    # NOTE(review): numerators count every tagged token (stopwords included)
    # while denominators count lemmas left after stopword/punctuation
    # removal, so these "percentages" need not add up to <= 100 - confirm
    # that this is the intended definition.
    pos_order = ('verb', 'noun', 'adj', 'adv')
    total_percents = [percent(pos_counts_dict, pos, word_count) for pos in pos_order]
    unique_percents = [percent(pos_nunique_dict, pos, words_nunique) for pos in pos_order]

    return [prop_count] + total_percents + [prop_nunique] + unique_percents

In [28]:
# Run the whole pipeline on the sample script text held in txt.
print(extract_text_features(txt))

[2404, 32.2165287084483, 44.06122717119958, 6.280383655235843, 10.40599132834056, 594, 38.275193798449614, 67.95865633074935, 15.406976744186046, 6.040051679586563]
