# Example: Loading and Analyzing the Sarcasm Dataset
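In this example, we load a dataset of sarcastic and non-sarcastic news headlines, build a token dictionary (vocabulary) from the headline text, convert each headline into a padded vector of integer token indices, and save the results to disk.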

## Setup
We set up the computational environment by including [the `Include.jl` file](Include.jl) using [the `include(...)` method](https://docs.julialang.org/en/v1/base/base/#Base.include). The [`Include.jl` file](Include.jl) loads external packages and functions we will use in these examples. 
* For additional information on functions and types used in this example, see the [Julia programming language documentation](https://docs.julialang.org/en/v1/). 

In [3]:
include("Include.jl");

## Task 1: Load the sarcasm dataset
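In this task, we load the sarcasm headlines dataset from the `Sarcasm_Headlines_Dataset_v2.txt` file in the data directory. We build the path to the file and pass it to the `corpus` function; we store the dataset model that it returns in the `dataset` variable.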

In [5]:
# build the path to the headlines file, then load it into a corpus model -
dataset = corpus(joinpath(_PATH_TO_DATA, "Sarcasm_Headlines_Dataset_v2.txt"));

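Let's look at the first record. Each record is a `MySarcasmRecordModel` instance that holds a sarcasm flag, the headline text, and the URL of the article. The records are stored in a dictionary keyed by an integer record index: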

In [7]:
# display the record stored under key 1 -
dataset.records[1]

MySarcasmRecordModel(true, "thirtysomething scientists unveil doomsday clock of hair loss", "https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205")

In [8]:
# check the type of the container that holds the records -
typeof(dataset.records)

Dict{Int64, MySarcasmRecordModel}

## Task 2: Build the tokens dictionary
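In this task, we build the vocabulary: a dictionary that maps each unique token (word) found in the headlines to an integer index.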

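First, we collect the unique tokens. We split each headline on spaces and add every non-empty token that we have not seen before to the `tokenarray` variable: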

In [11]:
# Collect the unique, non-empty tokens found in the headlines, in order of first
# appearance. Membership is tracked in a Set: the previous `in(token, tokenarray)`
# check scanned the whole array for every token, which is quadratic in the
# vocabulary size. The `let` block keeps the helper variables out of global scope.
tokenarray = let
    unique_tokens = Array{String,1}();
    seen = Set{String}();
    for record ∈ values(dataset.records)

        # split the headline on single spaces; `keepempty = false` drops the empty
        # pieces produced by consecutive, leading, or trailing spaces, so no
        # separate strip/isempty step is needed -
        for piece ∈ split(record.headline, ' '; keepempty = false)
            token = String(piece);

            # keep a token only the first time we encounter it -
            if (token ∈ seen) == false
                push!(seen, token);
                push!(unique_tokens, token);
            end
        end
    end
    unique_tokens
end;

Next, we sort the `tokenarray` in place, so that each token's position in the array gives it a fixed index:

In [13]:
# sort the tokens in place using the default ordering -
sort!(tokenarray)

29662-element Vector{String}:
 "#"
 "##"
 "#1"
 "#11717"
 "#4"
 "#5"
 "#addcandytoamovie"
 "#addclimatechangetotv"
 "#alohahuffpost"
 "#alternativefacts"
 "#badpicturemonday"
 "#blacklivesmatter"
 "#brownribboncampaign"
 ⋮
 "zoologists"
 "zoomed"
 "zoos"
 "zoroastrianism"
 "zs"
 "zsa"
 "zucker"
 "zuckerberg"
 "zuckerbergs"
 "zz"
 "éclairs"
 "ünited"

In [14]:
# spot check: test whether the token "jacquie" is in the token array -
"jacquie" ∈ tokenarray 

true

Now we build the `tokendictionary`, which maps each token to its position in the sorted `tokenarray`:

In [16]:
# map every token to its position in the token array -
tokendictionary = Dict{String, Int64}();
for (position, token) ∈ enumerate(tokenarray)
    tokendictionary[token] = position;
end

In [17]:
# display the token => index dictionary -
tokendictionary

Dict{String, Int64} with 29662 entries:
  "rosecolored"           => 22746
  "trumpland"             => 27336
  "irreplaceable"         => 14091
  "cluelessly"            => 5597
  "syriaalthough"         => 26086
  "bumbum"                => 4204
  "#addclimatechangetotv" => 8
  "dumber"                => 8590
  "daraya"                => 7072
  "jakrapong"             => 14213
  "henry"                 => 12550
  "skylight"              => 24355
  "bidder"                => 3239
  "abducted"              => 934
  "rises"                 => 22533
  "hampshire"             => 12131
  "droogs"                => 8492
  "whiz"                  => 29009
  "buffetts"              => 4152
  "il"                    => 13307
  "celebfilled"           => 4868
  "msnbc"                 => 17516
  "starches"              => 25204
  "tribunal"              => 27224
  "lovers"                => 15869
  ⋮                       => ⋮

Finally, we store the token dictionary in the `tokens` field of the `dataset`:

In [19]:
# attach the token dictionary to the dataset; later cells read it as dataset.tokens -
dataset.tokens = tokendictionary;

## Task 3: Tokenize headline records
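In this task, we convert the headlines into vectors of integers using the token dictionary. We start by getting the number of records in the dataset: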

In [21]:
number_of_records = length(dataset.records); # total number of records in the dataset

### Look at a random record
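Let's pick a record at random and tokenize its headline with the `tokenize` function. The result has one integer per word in the headline: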

In [23]:
# draw a random record index and look up the corresponding record -
random_test_record = dataset.records[rand(1:number_of_records)]

MySarcasmRecordModel(false, "frances marine le pen backs trump and denounces clinton", "https://www.huffingtonpost.com/entry/le-pen-trump_us_57c74883e4b0e60d31dd076c")

In [24]:
# display the headline text of the randomly selected record -
random_test_record.headline

"frances marine le pen backs trump and denounces clinton"

In [25]:
# convert the headline into a vector of integers using the token dictionary -
# NOTE(review): tokenize is defined outside this notebook (presumably loaded by Include.jl)
tv = tokenize(random_test_record.headline, dataset.tokens)

9-element Vector{Int64}:
 10747
 16306
 15246
 19557
  2613
 27318
  1739
  7503
  5538

### Compute the maximum pad length
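Headlines contain different numbers of tokens, but we will pad every token vector to a common length. So we compute the maximum number of tokens in any headline, which we use as the pad length: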

In [27]:
# Length of the longest tokenized headline; used below as the common pad length.
# `maximum` replaces the hand-rolled running maximum, which reassigned a global
# inside a top-level loop and so only worked under the notebook's soft-scope rule.
# `init = 0` reproduces the original starting value (the result for an empty dataset).
max_pad_length = maximum(
    record -> length(tokenize(record.headline, dataset.tokens)),
    values(dataset.records); init = 0);

### Compute the number of sarcasm and non-sarcasm samples

Next, we count how many records are flagged as sarcastic and how many are not:

In [30]:
# Count the records flagged as sarcastic; every other record is non-sarcastic.
# `count` replaces the manual counters, which were incremented with `+=` on globals
# inside a top-level loop and so only worked under the notebook's soft-scope rule.
number_of_sarcasm_samples = count(record -> record.issarcastic == true, values(dataset.records));
number_of_non_sarcasm_samples = length(dataset.records) - number_of_sarcasm_samples;

### Compute the vector representation of the sarcastic samples
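We tokenize each sarcastic headline and pad the result to `max_pad_length` (the padding shows up as trailing zeros in the output below). We store the vectors in the `sarcasim_sample_dictionary`, keyed by record index: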

In [65]:
# padded token vectors for the sarcastic headlines, keyed by record index -
sarcasim_sample_dictionary = Dict{Int64, Array{Int64,1}}();
for i ∈ 1:number_of_records

    # skip the records that are not flagged as sarcastic -
    record = dataset.records[i];
    record.issarcastic == true || continue

    # tokenize the headline, padded out to the common length -
    sarcasim_sample_dictionary[i] = tokenize(record.headline, dataset.tokens, pad = max_pad_length);
end
sarcasim_sample_dictionary[1]

151-element Vector{Int64}:
 26616
 23294
 27979
  8294
  5552
 18532
 12046
 15827
     0
     0
     0
     0
     0
     ⋮
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0

### Compute the vector representation of the non-sarcastic samples
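We repeat the same steps for the non-sarcastic headlines and store the padded vectors in the `non_sarcasim_sample_dictionary`, keyed by record index. Afterwards, we check that the number of entries matches the number of non-sarcastic samples counted above: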

In [34]:
# padded token vectors for the non-sarcastic headlines, keyed by record index -
non_sarcasim_sample_dictionary = Dict{Int64, Array{Int64,1}}();
for i ∈ 1:number_of_records

    # skip the records that are not flagged as non-sarcastic -
    record = dataset.records[i];
    record.issarcastic == false || continue

    # tokenize the headline, padded out to the common length -
    non_sarcasim_sample_dictionary[i] = tokenize(record.headline, dataset.tokens, pad = max_pad_length);
end
non_sarcasim_sample_dictionary

Dict{Int64, Vector{Int64}} with 14985 entries:
  12427 => [17646, 22222, 26825, 12326, 20944, 29191, 28513, 21851, 8482, 0  … …
  7685  => [26617, 26362, 27361, 26825, 16116, 26533, 22567, 22966, 29483, 3016…
  3406  => [11740, 15063, 1642, 6107, 26533, 20434, 18532, 6329, 13457, 20186  …
  1090  => [5538, 1812, 27111, 15251, 24022, 23869, 29113, 13457, 18310, 0  …  …
  18139 => [3841, 19962, 7649, 28806, 14099, 24646, 24646, 24646, 24646, 23208 …
  17088 => [20832, 20914, 29113, 20689, 13457, 18251, 4680, 0, 0, 0  …  0, 0, 0…
  16805 => [6360, 14099, 4039, 4502, 6090, 10346, 14140, 0, 0, 0  …  0, 0, 0, 0…
  11251 => [26533, 27946, 27408, 1739, 18293, 3248, 17615, 26825, 2931, 26533  …
  25327 => [26148, 29544, 14745, 26825, 29281, 7135, 22262, 13457, 18319, 17991…
  8060  => [723, 28769, 26825, 6931, 28824, 6452, 1031, 26825, 28824, 20002  … …
  14167 => [17739, 4489, 29086, 9773, 29191, 15044, 11060, 13457, 1670, 12940  …
  8660  => [26533, 9718, 18797, 15497, 1739, 7183, 18532, 8924

In [35]:
# sanity check: one vector per record that was counted as non-sarcastic -
@assert length(non_sarcasim_sample_dictionary) == number_of_non_sarcasm_samples

## Final: Save data to disk
Finally, we don't want to recompute the corpus, the token dictionary, etc., every time we need them. So let's save these objects to disk in a `.jld2` file, [an HDF5-compatible binary format](https://en.wikipedia.org/wiki/Hierarchical_Data_Format). To start, specify a path:

In [37]:
# the saved data goes into the data directory, next to the raw headlines file -
path_to_save_file = joinpath(_PATH_TO_DATA, "L4a-SarcasmSamplesTokenizer-SavedData.jld2");

Then we save the token dictionary, the dataset, and the number of records to the file:

In [39]:
# write the objects to disk; each dictionary key is the name the object is stored under -
# NOTE(review): max_pad_length and the two sample dictionaries are not saved - confirm this is intended
save(path_to_save_file, Dict("tokendictionary" => tokendictionary, 
        "dataset" => dataset, "number_of_records" => number_of_records));