# Preprocessing


In [17]:
using Statistics, LinearAlgebra, Random, DataFrames, Plots, StatsPlots, CSV, JSON

In [4]:
# Load the movie metadata table from "movies_metadata.csv" (relative path).
# NOTE(review): recent CSV.jl releases expect an explicit sink, e.g.
# `CSV.read(path, DataFrame)` — confirm against the CSV.jl version in use.
data = CSV.read("movies_metadata.csv")



Unnamed: 0_level_0,adult,belongs_to_collection
Unnamed: 0_level_1,String,String⍰
1,False,"{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}"
2,False,missing
3,False,"{'id': 119050, 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg', 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg'}"
4,False,missing
5,False,"{'id': 96871, 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg', 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg'}"
6,False,missing
7,False,missing
8,False,missing
9,False,missing
10,False,"{'id': 645, 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg', 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg'}"


In [48]:
# Pull out the vote_average column as a vector.
# `y` is presumably the prediction target — confirm against the modelling cells.
y = data[:,:vote_average]

45466-element Array{Union{Missing, Float64},1}:
 7.7
 6.9
 6.5
 6.1
 5.7
 7.7
 6.2
 5.4
 5.5
 6.6
 6.5
 5.7
 7.1
 ⋮  
 4.0
 6.0
 6.3
 7.0
 7.0
 3.5
 5.7
 4.0
 9.0
 3.8
 0.0
 0.0

In [5]:
# List the column names available in `data`.
names(data)

24-element Array{Symbol,1}:
 :adult                
 :belongs_to_collection
 :budget               
 :genres               
 :homepage             
 :id                   
 :imdb_id              
 :original_language    
 :original_title       
 :overview             
 :popularity           
 :poster_path          
 :production_companies 
 :production_countries 
 :release_date         
 :revenue              
 :runtime              
 :spoken_languages     
 :status               
 :tagline              
 :title                
 :video                
 :vote_average         
 :vote_count           

In [6]:
"""
    onehot(column, cats=unique(column))

Return a `Float64` matrix with one row per entry of `column` and one column per entry
of `cats`. Row `i` holds `1.0` in the column of the category equal to `column[i]` and
`0.0` everywhere else; the row stays all-zero when `column[i]` is not among `cats`.
"""
function onehot(column, cats=unique(column))
    nrows = first(size(column))
    encoded = zeros(nrows, first(size(cats)))

    # Category => column position; a repeated category keeps its last position.
    position = Dict(cat => j for (j, cat) in enumerate(cats))

    for row in 1:nrows
        col = get(position, column[row], nothing)
        col === nothing && continue
        encoded[row, col] = 1
    end
    return encoded
end

onehot

In [61]:
"""
    manyhot(column, cats=nothing)

Compute a many-hot (multi-label indicator) encoding of `column`.

`column` is an indexable collection whose entries are themselves iterable collections
of labels. The result is a `Float64` matrix with one row per entry of `column` and one
column per category; element `(i, j)` is `1.0` when category `j` occurs in `column[i]`
and `0.0` otherwise.

# Arguments
- `column`: collection of label collections (e.g. a vector of sets of strings).
- `cats`: categories to encode, in output-column order. When `nothing` (the default),
  the categories are all distinct labels found in `column`, in the iteration order of
  an internal `Set`, i.e. an unspecified order. Pass `cats` explicitly to get a known
  column order or to encode new data with the same columns as earlier data. Labels
  that are not in `cats` are ignored.
"""
function manyhot(column, cats=nothing)
    if cats === nothing
        # Gather every distinct label that occurs anywhere in `column`.
        found = Set()
        for labels in column, label in labels
            push!(found, label)
        end
        categories = collect(found)
    else
        categories = cats
    end

    # Category => output column; 0 is used below as the "not a category" sentinel.
    colindex = Dict()
    for (j, category) in enumerate(categories)
        colindex[category] = j
    end

    nrows = size(column, 1)
    result = zeros(nrows, length(categories))
    for i in 1:nrows
        # Writing 1 is idempotent, so repeated labels within a row need no de-duplication.
        for label in column[i]
            j = get(colindex, label, 0)
            if j != 0
                result[i, j] = 1
            end
        end
    end
    return result
end

manyhot

Preprocessing genres using many-hot encoding

In [141]:
"""
    preprocess_json(d)

Extract the set of `"name"` values from the record-list string `d`.

The string is first rewritten by a fixed sequence of quote substitutions so that
`JSON.parse` accepts it (the input appears to use single-quoted, Python-style
literals — confirm against the data). Each parsed element is then indexed with
`"name"` and the value is collected.

Always returns a `Set{String}`. The set is empty when `d` is `missing`, when `d` equals
`"False"`, or when parsing/extraction throws; in the last case the original and the
rewritten string are printed for inspection and the error is not rethrown.
"""
function preprocess_json(d)
    if ismissing(d) || d == "False"
        return Set{String}()
    end

    # Order matters: each rule operates on the output of the previous one.
    rewrites = (
        "\"" => "\'",                        # drop every double quote first
        "\\xa0" => " ",                      # literal backslash-x-a-0 text becomes a space
        "'name'" => "\"name\"",              # re-quote the keys ...
        "'id'" => "\"id\"",
        ": '" => ": \"",                     # ... and the opening/closing quotes of values
        "'}" => "\"}",
        "'," => "\",",
        # NOTE(review): one-off exception for a value containing "Orlenok"; presumably
        # its quote was wrongly converted by a rule above — confirm against the data.
        "Orlenok\"" => "Orlenok'",
        "'iso_3166_1'" => "\"iso_3166_1\"",
    )
    x = d
    for rule in rewrites
        x = replace(x, rule)
    end

    result = Set{String}()
    try
        for dic in JSON.parse(x)
            push!(result, dic["name"])
        end
    catch err
        # Best effort: report the offending entry and fall back to an empty set.
        println(d)
        println(x)
        println("=============================")
        return Set{String}()
    end
    return result
end

preprocess_json (generic function with 1 method)

In [142]:
# Parse each row's genres entry into a set of names, then many-hot encode those sets.
genres = map(preprocess_json, data[:, :genres]) |> manyhot

45466×32 Array{Float64,2}:
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  …  0.0  0.0  0.0  0.0  0.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  1.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  1.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  1

Preprocessing production companies

In [133]:
# Parse each row's production_companies entry into a set of names, then many-hot encode.
production_companies = map(preprocess_json, data[:, :production_companies]) |> manyhot

45466×23537 Array{Float64,2}:
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0

There are so many production companies (23,537 distinct values) that including them as features would risk overfitting, so we exclude this feature for now.

Preprocessing production countries

In [143]:
# Parse each row's production_countries entry into a set of names, then many-hot encode.
production_countries = map(preprocess_json, data[:, :production_countries]) |> manyhot

4.3
4.3
6.0
6.0
7.0
7.0


45466×160 Array{Float64,2}:
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     0.0  0.0  0.0  0.0  0.0  0.0  0.0
 1.0  0.0  0.0  0.0  0.0  0.0  0.0  