# Data cleaning
We transform string data into matrices:
- movie_names matrix
- movie_rating matrix
- actor/actress names matrix
- actor/actress "acted-in" boolean matrix (stored as actor-index/movie-index pairs)

## Setup

### Import libraries

In [None]:
import numpy as np
import re
import os

### Set up constants

In [None]:
# All regexes are raw strings so that backslash sequences such as \s and \(
# reach the regex engine untouched instead of being treated as (invalid)
# Python string escapes.

# Field separator inside a data line: a run of two or more whitespace chars.
pattern = re.compile(r'\s{2,}')
# Separator between a performer's name and the first credit: one or more tabs.
actor_pattern = re.compile(r'\t{1,}')
# Captures the four-digit year (1000-2999) that follows an opening parenthesis
# in a title, optionally followed by a trailing " {...}" suffix.
year_in_name_pattern = re.compile(r'^(?:.+\()([1-2][0-9]{3})(?:.*\))(?:\s\{.*\}){0,1}$')
# A title is kept only when its vote count is strictly greater than this.
min_vote = 1000

## Cleaning

### Movie and Ratings

In [None]:
%%time

# Build two parallel arrays from ratings.list: movie titles and their ratings.
# Rows are gathered in plain lists and converted once at the end, because
# np.append copies the whole array on every call (quadratic over the file).
movie_name_list = []
rating_list = []

with open('../data_raw/ratings.list', encoding='ISO-8859-1') as fs:
    line = fs.readline()

    # Skip the preamble. The `line and` guard stops at end of file instead of
    # looping forever when the marker line is missing.
    while line and line != 'MOVIE RATINGS REPORT\n':
        line = fs.readline()

    # Step over the lines between the marker and the first data row.
    for _skipped in range(3):
        line = fs.readline()

    # Data rows run until the first blank line (or end of file).
    while line and line != '\n':
        arr = re.split(pattern, line.strip())

        # After splitting: arr[1] is the vote count, arr[2] the rating and
        # arr[3] the title. Rows with fewer fields are skipped.
        if len(arr) >= 4:
            title = arr[3]
            vote_count = int(arr[1])
            # Keep only titles with more than `min_vote` votes whose name
            # neither starts nor ends with a double quote.
            quoted = title.startswith('"') or title.endswith('"')
            if vote_count > min_vote and not quoted:
                movie_name_list.append(title)
                rating_list.append(float(arr[2]))

        line = fs.readline()

# Number of titles kept.
count = len(movie_name_list)
movie_names = np.array(movie_name_list, dtype=str)
ratings = np.array(rating_list, dtype=float)

np.savetxt('../data_processed/matrix/mat_movie_names.csv', movie_names.T, delimiter=",", fmt='\"%s\"')
np.savetxt('../data_processed/matrix/mat_ratings.csv', ratings.T, delimiter=",", fmt='%.1f')

### Actresses

In [None]:
%%time

# Parse actresses.list into:
#   actors    - names of the performers credited in at least one kept movie
#   actin_pos - (n, 2) integer array of [performer_index, movie_index] rows
# Results are gathered in lists and converted once at the end, because
# np.append copies the whole array on every call.

# Map each title to every row of movie_names holding it. Built once, so each
# credit costs a dictionary lookup instead of a scan of the whole array.
movie_rows = {}
for row, known_title in enumerate(movie_names):
    movie_rows.setdefault(str(known_title), []).append(row)

actor_names = []
actin_rows = []
count = 0
list_end = '-----------------------------------------------------------------------------\n'

with open('../data_raw/actresses.list', encoding='ISO-8859-1') as fs:
    line = fs.readline()

    # Skip the preamble. The `line and` guard stops at end of file instead of
    # looping forever when the marker line is missing.
    while line and line != 'THE ACTRESSES LIST\n':
        line = fs.readline()

    # Step over the lines between the marker and the first entry.
    for _skipped in range(5):
        line = fs.readline()

    # One iteration per performer entry, until the closing dashed line.
    while line and line != list_end:
        fields = re.split(actor_pattern, line)
        if len(fields) < 2:
            # No tab, so this is not a "name<TAB>credit" line: skip it.
            line = fs.readline()
            continue

        name = fields[0]
        # The first credit shares the line with the name; the remaining
        # credits follow, one per line, until a blank line ends the entry.
        credits = [fields[1]]
        line = fs.readline()
        while line and line != '\n':
            credits.append(line)
            line = fs.readline()

        # The title is the first field of each credit; titles that start or
        # end with a double quote are excluded, as are empty ones.
        movies = []
        for credit in credits:
            title = re.split(pattern, credit.strip())[0]
            if title and not (title.startswith('"') or title.endswith('"')):
                movies.append(title)

        # Rows of movie_names matched by any credit, in ascending order.
        matched = sorted({row for title in movies for row in movie_rows.get(title, ())})
        if matched:
            actin_rows.extend((count, row) for row in matched)
            actor_names.append(name)
            count += 1

        line = fs.readline()

actors = np.array(actor_names, dtype=str)
# reshape keeps the (0, 2) shape when nobody matched.
actin_pos = np.array(actin_rows, dtype=int).reshape(-1, 2)

print('writing names...')
print('names length: %d' % len(actors))
np.savetxt('../data_processed/matrix/mat_actresses_names.csv', actors.T, delimiter=",", fmt='\"%s\"')
print('writing actin...')
np.savez('../data_processed/matrix/mat_actresses_actin', actin_pos=actin_pos, shape=(count, movie_names.shape[0]))

### Actors

In [None]:
%%time

# Parse actors.list into:
#   actors    - names of the performers credited in at least one kept movie
#   actin_pos - (n, 2) integer array of [performer_index, movie_index] rows
# Results are gathered in lists and converted once at the end, because
# np.append copies the whole array on every call.

# Map each title to every row of movie_names holding it. Built once, so each
# credit costs a dictionary lookup instead of a scan of the whole array.
movie_rows = {}
for row, known_title in enumerate(movie_names):
    movie_rows.setdefault(str(known_title), []).append(row)

actor_names = []
actin_rows = []
count = 0
list_end = '-----------------------------------------------------------------------------\n'

with open('../data_raw/actors.list', encoding='ISO-8859-1') as fs:
    line = fs.readline()

    # Skip the preamble. The `line and` guard stops at end of file instead of
    # looping forever when the marker line is missing.
    while line and line != 'THE ACTORS LIST\n':
        line = fs.readline()

    # Step over the lines between the marker and the first entry.
    for _skipped in range(5):
        line = fs.readline()

    # One iteration per performer entry, until the closing dashed line.
    while line and line != list_end:
        fields = re.split(actor_pattern, line)
        if len(fields) < 2:
            # No tab, so this is not a "name<TAB>credit" line: skip it.
            line = fs.readline()
            continue

        name = fields[0]
        # The first credit shares the line with the name; the remaining
        # credits follow, one per line, until a blank line ends the entry.
        credits = [fields[1]]
        line = fs.readline()
        while line and line != '\n':
            credits.append(line)
            line = fs.readline()

        # The title is the first field of each credit; titles that start or
        # end with a double quote are excluded, as are empty ones.
        movies = []
        for credit in credits:
            title = re.split(pattern, credit.strip())[0]
            if title and not (title.startswith('"') or title.endswith('"')):
                movies.append(title)

        # Rows of movie_names matched by any credit, in ascending order.
        matched = sorted({row for title in movies for row in movie_rows.get(title, ())})
        if matched:
            actin_rows.extend((count, row) for row in matched)
            actor_names.append(name)
            count += 1

        line = fs.readline()

actors = np.array(actor_names, dtype=str)
# reshape keeps the (0, 2) shape when nobody matched.
actin_pos = np.array(actin_rows, dtype=int).reshape(-1, 2)

print('writing names...')
print('names length: %d' % len(actors))
np.savetxt('../data_processed/matrix/mat_actors_names.csv', actors.T, delimiter=",", fmt='\"%s\"')
print('writing actin...')
np.savez('../data_processed/matrix/mat_actors_actin', actin_pos=actin_pos, shape=(count, movie_names.shape[0]))