# Example: Loading and Analyzing the Sarcasm Dataset
Fill me

## Setup
We set up the computational environment by including [the `Include. jl` file](Include.jl) using [the `include(...)` method](https://docs.julialang.org/en/v1/base/base/#Base.include). The [`Include.jl` file](Include.jl) loads external packages and functions we will use in these examples. 
* For additional information on functions and types used in this example, see the [Julia programming language documentation](https://docs.julialang.org/en/v1/). 

In [3]:
include("Include.jl");

## Task 1: Load the sarcasm dataset
Fill me in

In [5]:
dataset = joinpath(_PATH_TO_DATA, "Sarcasm_Headlines_Dataset_v2.txt") |> corpus;

Fill me in

In [7]:
dataset.records[1]

MySarcasmRecordModel(true, "thirtysomething scientists unveil doomsday clock of hair loss", "https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205")

In [8]:
typeof(dataset.records)

Dict{Int64, MySarcasmRecordModel}

## Task 2: Build the tokens dictionary
Fill me in

Fill me in

In [11]:
tokenarray = Array{String,1}();
for (k,v) ∈ dataset.records

    # process headline data -
    headline = v.headline;
    tokens = split(headline, " ") .|> String;

    # process -
    for token ∈ tokens

        # strip any leading or trailing spaces -
        token = strip(token, ' ');
        
        if (in(token, tokenarray) == false && isempty(token) == false)
            push!(tokenarray, token);
        end
    end 
end

Fill me in

In [13]:
tokenarray |> sort!

29662-element Vector{String}:
 "#"
 "##"
 "#1"
 "#11717"
 "#4"
 "#5"
 "#addcandytoamovie"
 "#addclimatechangetotv"
 "#alohahuffpost"
 "#alternativefacts"
 "#badpicturemonday"
 "#blacklivesmatter"
 "#brownribboncampaign"
 ⋮
 "zoologists"
 "zoomed"
 "zoos"
 "zoroastrianism"
 "zs"
 "zsa"
 "zucker"
 "zuckerberg"
 "zuckerbergs"
 "zz"
 "éclairs"
 "ünited"

In [14]:
"jacquie" ∈ tokenarray 

true

Fill me in

In [16]:
tokendictionary = Dict{String, Int64}();
for i ∈ eachindex(tokenarray)
    key = tokenarray[i]
    tokendictionary[key] = i; 
end

In [17]:
tokendictionary

Dict{String, Int64} with 29662 entries:
  "rosecolored"           => 22746
  "trumpland"             => 27336
  "irreplaceable"         => 14091
  "cluelessly"            => 5597
  "syriaalthough"         => 26086
  "bumbum"                => 4204
  "#addclimatechangetotv" => 8
  "dumber"                => 8590
  "daraya"                => 7072
  "jakrapong"             => 14213
  "henry"                 => 12550
  "skylight"              => 24355
  "bidder"                => 3239
  "abducted"              => 934
  "rises"                 => 22533
  "hampshire"             => 12131
  "droogs"                => 8492
  "whiz"                  => 29009
  "buffetts"              => 4152
  "il"                    => 13307
  "celebfilled"           => 4868
  "msnbc"                 => 17516
  "starches"              => 25204
  "tribunal"              => 27224
  "lovers"                => 15869
  ⋮                       => ⋮

Fill me in

In [19]:
dataset.tokens = tokendictionary;

## Task 3: Tokenize an headline records
Fill me in

In [21]:
number_of_records = dataset.records |> length; # what is going on here?

Fill me in

In [23]:
random_test_record = rand(1:number_of_records) |> i -> dataset.records[i]

MySarcasmRecordModel(true, "dog a pervert in ways owner will never know", "https://local.theonion.com/dog-a-pervert-in-ways-owner-will-never-know-1834048680")

In [24]:
random_test_record.headline

"dog a pervert in ways owner will never know"

In [25]:
tv = tokenize(random_test_record.headline, dataset.tokens)

9-element Vector{Int64}:
  8202
   913
 19695
 13457
 28769
 19082
 29086
 17968
 14882

### Compute the maximum pad length
Fill me in

In [82]:
max_pad_length = 0;
for i ∈ 1:number_of_records

    # get a record -
    test_record_length = tokenize(dataset.records[i].headline, dataset.tokens) |> length;
    if (test_record_length > max_pad_length)
        max_pad_length = test_record_length;

        @show i
    end
end

i = 1
i = 2
i = 11
i = 14
i = 37
i = 97
i = 106
i = 189
i = 584
i = 1238
i = 1450
i = 2147
i = 7303


### Compute the number of sarcasm and non-sarcasm samples

Fill me in

In [30]:
number_of_sarcasm_samples = 0;
number_of_non_sarcasm_samples = 0;
for i ∈ 1:number_of_records
    
    is_sarcastic_flag = dataset.records[i].issarcastic
    if (is_sarcastic_flag == true)
        number_of_sarcasm_samples += 1;
    else
        number_of_non_sarcasm_samples+=1;
    end
end

### Compute the vector representation of the sarcastic samples
Fill me in

In [84]:
sarcasim_sample_dictionary = Dict{Int64, Array{Int64,1}}();
for i ∈ 1:number_of_records
    is_sarcastic_flag = dataset.records[i].issarcastic
    if (is_sarcastic_flag == true)
        v = tokenize(dataset.records[i].headline, dataset.tokens, 
            pad = max_pad_length);        
        sarcasim_sample_dictionary[i] = v;

        if (i == 7303)
            @show v
        end
    end
end
sarcasim_sample_dictionary[7303]

i = 1
i = 4
i = 5
i = 8
i = 9
i = 15
i = 17
i = 18
i = 22
i = 26
i = 34
i = 35
i = 37
i = 40
i = 45
i = 47
i = 50
i = 52
i = 53
i = 54
i = 55
i = 56
i = 60
i = 61
i = 64
i = 65
i = 66
i = 69
i = 75
i = 76
i = 82
i = 87
i = 88
i = 89
i = 93
i = 94
i = 97
i = 98
i = 101
i = 104
i = 105
i = 108
i = 109
i = 111
i = 114
i = 115
i = 119
i = 120
i = 122
i = 125
i = 127
i = 128
i = 129
i = 131
i = 132
i = 136
i = 137
i = 139
i = 144
i = 145
i = 148
i = 152
i = 153
i = 154
i = 155
i = 157
i = 160
i = 161
i = 163
i = 164
i = 166
i = 169
i = 170
i = 175
i = 178
i = 179
i = 180
i = 183
i = 184
i = 185
i = 187
i = 188
i = 190
i = 192
i = 196
i = 199
i = 204
i = 207
i = 209
i = 210
i = 211
i = 214
i = 219
i = 221
i = 225
i = 226
i = 230
i = 231
i = 232
i = 234
i = 236
i = 237
i = 238
i = 243
i = 244
i = 245
i = 246
i = 248
i = 250
i = 252
i = 254
i = 255
i = 256
i = 259
i = 260
i = 263
i = 268
i = 269
i = 271
i = 272
i = 275
i = 276
i = 288
i = 289
i = 290
i = 291
i = 295
i = 296
i = 297
i = 300
i =

302-element Vector{Int64}:
 12977
 28955
 21418
 18356
 18666
 27018
 10602
 22673
  8365
 21392
 14876
 18982
  8260
     ⋮
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0
     0

### Compute the vector representation of the non-sarcastic samples
Fill me in