### Creating New Data Samples from IMSDB



Total number of data points: 22579

#### The original training set had this distribution of genres:


Action: 2392

Adventure: 147

Comedy: 2941

Drama: 8873

Horror: 456

Other: 270

Romance: 63

Sci-Fi: 613

Thriller: 6824

Since the dataset is so imbalanced, let's create enough additional samples of each genre to reach 8000 per genre.

In [32]:
import os
import collections

import numpy as np
import pandas as pd

In [5]:
# Root directory of the raw IMSDB scripts; genre folder names (e.g. 'Action')
# are joined onto this path.
# NOTE(review): hard-coded to one user's machine; change before running elsewhere.
datapath = '/Users/yngtodd/data/imsdb_raw_nov_2015/external'

In [9]:
# Number of samples every genre should have once augmentation is done.
TARGET_PER_GENRE = 8000

# New samples needed per genre: the target minus the count already present in
# the original training set. Drama (8873) already exceeds the target, so it
# has no entry.
# NOTE(review): "Other" (270) has no entry either; confirm that is intended.
num_action = TARGET_PER_GENRE - 2392
num_adventure = TARGET_PER_GENRE - 147
num_comedy = TARGET_PER_GENRE - 2941
num_horror = TARGET_PER_GENRE - 456
num_romance = TARGET_PER_GENRE - 63
num_scifi = TARGET_PER_GENRE - 613
num_thriller = TARGET_PER_GENRE - 6824

### Strategy

For each genre, repeatedly select a script at random and grab a chunk of 1000 consecutive characters from it.

### Action

In [180]:
print(f'Number of lines we need from action movies: {num_action}')

Number of lines we need from action movies: 5608


In [14]:
action_scripts = os.path.join(datapath, 'Action')

In [15]:
scripts = os.listdir(action_scripts)

In [28]:
# Which scripts will we grab from
# np.random.randint excludes `high`, so pass len(scripts) (not len(scripts)-1)
# to keep the last script selectable.
script_idxs = np.random.randint(low=0, high=len(scripts), size=num_action)

In [41]:
# Make sure we don't open the same script multiple times. 
counter_scripts = collections.Counter(script_idxs)

In [44]:
# Key: index indicating the script to open
# value: number of 1000 character snippets we should grab.
for script, num_lines in counter_scripts.items():
    print(f'Script: {script}, Number of 1000 chars to grab: {num_lines}')
    break

Script: 258, Number of 1000 chars to grab: 20


In [179]:
def sample_from_genre(genre_path, num_lines, chars_per_line=1000):
    """Sample fixed-length text snippets from randomly chosen scripts of a genre.

    Parameters
    ----------
    genre_path : str
        Directory whose entries are the script files for one genre.

    num_lines : int
        Total number of snippets ("lines") to sample across all scripts.

    chars_per_line : int
        Number of consecutive characters considered a line.

    Returns
    -------
    all_lines : List
        All the sampled lines, grouped by the script they were drawn from.

    Raises
    ------
    ValueError
        If `genre_path` contains no entries to sample from.
    """
    scripts = os.listdir(genre_path)
    if not scripts:
        raise ValueError(f'No scripts found in {genre_path!r}')
    # Which scripts will we grab from. `high` is exclusive in
    # np.random.randint, so len(scripts) keeps the last script reachable
    # (and lets a genre with a single script work at all).
    script_idxs = np.random.randint(low=0, high=len(scripts), size=num_lines)
    # Make sure we don't open the same script multiple times.
    counter_scripts = collections.Counter(script_idxs)
    # Key: index indicating the script to open
    # value: number of chars_per_line character snippets we should grab from key.
    all_lines = []
    for idx, n_lines in counter_scripts.items():
        script_path = os.path.join(genre_path, scripts[idx])
        lines = sample_from_script(script_path, n_lines, chars_per_line)
        all_lines.extend(lines)

    return all_lines

In [171]:
def sample_from_script(script_path, num_lines, chars_per_line):
    """Draw random fixed-length snippets ("lines") from a single script.

    The script is read with `read_script`, cut into consecutive chunks with
    `split_n_lines`, and `num_lines` chunks are then picked at random with
    `np.random.choice` (which samples with replacement by default).

    Parameters
    ----------
    script_path : str
        Path to the script.

    num_lines : int
        Number of lines to sample.

    chars_per_line : int
        Number of consecutive characters considered a line.

    Returns
    -------
    lines : numpy.ndarray
        The sampled lines, as returned by `np.random.choice`.
    """
    text = read_script(script_path)
    chunks = split_n_lines(text, num_chars=chars_per_line)
    return np.random.choice(chunks, num_lines)

### Example Script 

In [151]:
my_script = os.path.join(action_scripts, scripts[0])

with open(my_script, 'r') as f:
    file = f.read().splitlines()

In [152]:
# Collapse every run of whitespace in each line to a single space; this also
# drops leading and trailing whitespace.
all_lines = [" ".join(raw_line.split()) for raw_line in file]

In [168]:
def read_script(path: str) -> str:
    """Read a script file and return its text as one whitespace-normalised string.

    Parameters
    ----------
    path : str
        Path to the script file to read.

    Returns
    -------
    str
        The file's lines, each with runs of whitespace collapsed to a single
        space and leading/trailing whitespace removed, joined with single
        spaces. Blank lines contribute an empty string, so they leave a
        doubled space in the result.
    """
    # Open the file named by `path`, not a notebook-level variable, so each
    # call reads the script that was actually requested.
    with open(path, 'r') as f:
        file_lines = f.read().splitlines()

    return ' '.join(' '.join(line.split()) for line in file_lines)

In [158]:
n = 1000
all_lines2 = [all_lines[i:i+n] for i in range(0, len(all_lines), n)]

In [159]:
from typing import List


def split_n_lines(script: str, num_chars: int=1000) -> List:
    """Split a script into consecutive chunks of `num_chars` characters.

    Parameters
    ----------
    script : str
        The text to split.

    num_chars : int
        Size of each chunk. The final chunk is shorter when the length of
        `script` is not a multiple of `num_chars`.

    Returns
    -------
    List
        The chunks in their original order; empty if `script` is empty.

    Raises
    ------
    ValueError
        If `num_chars` is not a positive integer.
    """
    if num_chars <= 0:
        raise ValueError(f'num_chars must be positive, got {num_chars}')
    # The chunk size must come from the `num_chars` argument rather than any
    # module-level variable, so callers actually control it.
    return [script[i:i + num_chars] for i in range(0, len(script), num_chars)]

In [160]:
script2 = split_n_lines(all_lines)

In [162]:
script2[1] == all_lines2[1]

True

In [164]:
len(script2[1])

1000

In [169]:
charlies_angels = read_script(my_script)
lines = split_n_lines(charlies_angels)

### Quick Test

In [172]:
lines = sample_from_script(my_script, 10, 1000)

In [173]:
len(lines)

10

In [176]:
len(lines[1])

1000

In [181]:
all_action_lines = sample_from_genre(action_scripts, num_action, 1000)

In [182]:
len(all_action_lines)

5608

In [183]:
len(all_action_lines[0])

1000