# Example: Loading and Analyzing the Sarcasm Dataset
Fill me

## Setup
We set up the computational environment by including [the `Include. jl` file](Include.jl) using [the `include(...)` method](https://docs.julialang.org/en/v1/base/base/#Base.include). The [`Include.jl` file](Include.jl) loads external packages and functions we will use in these examples. 
* For additional information on functions and types used in this example, see the [Julia programming language documentation](https://docs.julialang.org/en/v1/). 

In [3]:
include("Include.jl");

## Task 1: Load the sarcasm dataset
Fill me in

In [5]:
corpusmodel = joinpath(_PATH_TO_DATA, "Sarcasm_Headlines_Dataset_v2.txt") |> corpus;

Fill me in

In [7]:
corpusmodel.records[1]

MySarcasmRecordModel(true, "thirtysomething scientists unveil doomsday clock of hair loss", "https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205")

In [8]:
typeof(corpusmodel.records)

Dict{Int64, MySarcasmRecordModel}

## Task 2: Build the tokens dictionary
Fill me in

Fill me in

In [11]:
tokenarray = Array{String,1}();
for (k,v) ∈ corpusmodel.records

    # process headline data -
    headline = v.headline;
    tokens = split(headline, " ") .|> String;

    # process -
    for token ∈ tokens

        # strip any leading or trailing spaces -
        token = strip(token, ' ');
        
        if (in(token, tokenarray) == false && isempty(token) == false)
            push!(tokenarray, token);
        end
    end 
end

Fill me in

In [13]:
tokenarray |> sort!

29662-element Vector{String}:
 "#"
 "##"
 "#1"
 "#11717"
 "#4"
 "#5"
 "#addcandytoamovie"
 "#addclimatechangetotv"
 "#alohahuffpost"
 "#alternativefacts"
 "#badpicturemonday"
 "#blacklivesmatter"
 "#brownribboncampaign"
 ⋮
 "zoologists"
 "zoomed"
 "zoos"
 "zoroastrianism"
 "zs"
 "zsa"
 "zucker"
 "zuckerberg"
 "zuckerbergs"
 "zz"
 "éclairs"
 "ünited"

In [14]:
"jacquie" ∈ tokenarray 

true

Fill me in

In [16]:
tokendictionary = Dict{String, Int64}();
inverse = Dict{Int64, String}();
for i ∈ eachindex(tokenarray)
    key = tokenarray[i]
    tokendictionary[key] = i; 
    inverse[i] = key;
end

In [17]:
tokendictionary

Dict{String, Int64} with 29662 entries:
  "rosecolored"           => 22746
  "trumpland"             => 27336
  "irreplaceable"         => 14091
  "cluelessly"            => 5597
  "syriaalthough"         => 26086
  "bumbum"                => 4204
  "#addclimatechangetotv" => 8
  "dumber"                => 8590
  "daraya"                => 7072
  "jakrapong"             => 14213
  "henry"                 => 12550
  "skylight"              => 24355
  "bidder"                => 3239
  "abducted"              => 934
  "rises"                 => 22533
  "hampshire"             => 12131
  "droogs"                => 8492
  "whiz"                  => 29009
  "buffetts"              => 4152
  "il"                    => 13307
  "celebfilled"           => 4868
  "msnbc"                 => 17516
  "starches"              => 25204
  "tribunal"              => 27224
  "lovers"                => 15869
  ⋮                       => ⋮

Fill me in

In [19]:
corpusmodel.tokens = tokendictionary;
corpusmodel.inverse = inverse;

## Task 3: Tokenize headline records
Fill me in

In [21]:
number_of_records = corpusmodel.records |> length; # what is going on here?

### Look at a random record
Fill me in

In [23]:
random_test_record = rand(1:number_of_records) |> i -> corpusmodel.records[i]

MySarcasmRecordModel(true, "world health organization not sure how but adam levines new fragrance the only antidote to mers virus", "https://www.theonion.com/world-health-organization-not-sure-how-but-adam-levi-1819575159")

In [24]:
random_test_record.headline

"world health organization not sure how but adam levines new fragrance the only antidote to mers virus"

In [25]:
tv = tokenize(random_test_record.headline, corpusmodel.tokens)

LoadError: UndefVarError: `dataset` not defined

### Compute the maximum pad length
Fill me in

In [27]:
max_pad_length = 0;
for i ∈ 1:number_of_records
    test_record_length = tokenize(corpusmodel.records[i].headline, corpusmodel.tokens) |> length;
    if (test_record_length > max_pad_length)
        max_pad_length = test_record_length;
    end
end

LoadError: UndefVarError: `dataset` not defined

### Compute the number of sarcasm and non-sarcasm samples

Fill me in

In [30]:
number_of_sarcasm_samples = 0;
number_of_non_sarcasm_samples = 0;
for i ∈ 1:number_of_records
    
    is_sarcastic_flag = corpusmodel.records[i].issarcastic
    if (is_sarcastic_flag == true)
        number_of_sarcasm_samples += 1;
    else
        number_of_non_sarcasm_samples+=1;
    end
end

### Compute the vector representation of the sarcastic samples
Fill me in

In [32]:
sarcasim_sample_dictionary = Dict{Int64, Array{Int64,1}}();
for i ∈ 1:number_of_records
    is_sarcastic_flag = corpusmodel.records[i].issarcastic
    if (is_sarcastic_flag == true)
        v = tokenize(corpusmodel.records[i].headline, corpusmodel.tokens, 
            pad = max_pad_length);        
        sarcasim_sample_dictionary[i] = v;
    end
end
sarcasim_sample_dictionary[1]

LoadError: UndefVarError: `dataset` not defined

### Compute the vector representation of the non-sarcastic samples
Fill me in

In [34]:
non_sarcasim_sample_dictionary = Dict{Int64, Array{Int64,1}}();
for i ∈ 1:number_of_records
    is_sarcastic_flag = corpusmodel.records[i].issarcastic
    if (is_sarcastic_flag == false)
        v = tokenize(corpusmodel.records[i].headline, corpusmodel.tokens, 
            pad = max_pad_length);        
        non_sarcasim_sample_dictionary[i] = v;
    end
end
non_sarcasim_sample_dictionary

LoadError: UndefVarError: `dataset` not defined

In [35]:
@assert length(non_sarcasim_sample_dictionary) == number_of_non_sarcasm_samples

LoadError: AssertionError: length(non_sarcasim_sample_dictionary) == number_of_non_sarcasm_samples

## Final: Save data to disk
Finally, we did a bunch of stuff in this example, and we don't want to have to recompute the corpus, token dictionary, etc. So let's save it [in an HDF5 encoded binary file](https://en.wikipedia.org/wiki/Hierarchical_Data_Format). To start, specify a path:

In [37]:
path_to_save_file = joinpath(_PATH_TO_DATA, "L4a-SarcasmSamplesTokenizer-SavedData.jld2");

Fill me in

In [39]:
save(path_to_save_file, Dict("tokendictionary" => tokendictionary, 
        "corpus" => corpusmodel, "number_of_records" => number_of_records));