# Import Modules

In [59]:
import pandas 
import re
import nltk
from langdetect import detect

# Fetch Dataset

In [22]:
#current directory containing dataset
directory = "Dataset/"

#read training set
train_csv = pandas.read_csv(directory+"mediaeval-2015-trainingset.txt", sep="	")
train_df = pandas.DataFrame(data = train_csv)

#read test set
test_csv = pandas.read_csv(directory+"mediaeval-2015-testset.txt", sep="	")
test_df = pandas.DataFrame(data = test_csv)


# Data Characterisation

In [23]:
#Dataset information, mainly different fields and size
train_df.info()
print()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int64

In [24]:
#shape of training/test sets
print(train_df.shape)
print(test_df.shape)

(14277, 7)
(3755, 7)


In [25]:
#example of dataset entities
train_csv.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [36]:
#events categorised by ImageID (train)
print("Train:")
print(train_df["imageId(s)"].str.split('_').str[0].value_counts()) 

Train:
imageId(s)
sandyA                9696
sandyB                2622
boston                 546
malaysia               501
sochi                  402
columbianChemicals     185
bringback              131
underwater             112
passport                46
pigFish                 14
elephant                13
livr                     9
Name: count, dtype: int64


In [37]:
#events categorised by ImageID (test)
print("Test:")
print(test_df["imageId(s)"].str.split('_').str[0].value_counts()) 

Test:
imageId(s)
syrianboy     1769
nepal         1353
eclipse        277
samurai        218
garissa         77
varoufakis      61
Name: count, dtype: int64


In [57]:
#example tweets for different events

event = "syrian" #change variable for different events

total_df = train_df + test_df
selected_tweets = total_df[total_df["imageId(s)"].str.contains(event, na=False)]["tweetText"].head(30)
print(selected_tweets)

1925    @80tinkerbell : El huracán Sandy llevó un tibu...
1926    «@FrasesFaceTwit «RT@NairobiG20El huracán Sand...
1927    RT @DeboConfesarQue El huracán Sandy llevó un ...
1928    Que miedoo :/!!!! RT @Riete: TIBURÓN!!! TIBURÓ...
1929    El huracán Sandy llevó un tiburón hasta las ca...
1930    La naturaleza no juega papá \nEl huracán #Sand...
1931    Vaya tela... Por lo visto el huracan Sandy ha ...
1932    Omg o_O RT @DjBelma: El huracán Sandy llevó un...
1933    Rt“@DeboConfesarQue El huracán Sandy llevó un ...
1934    “@britisacult: El Huracán #Sandy arrastra tibu...
1935    En  New Jersey,el huracan Sandy lleva los tibu...
1936    El huracán Sandy llevó un tiburón hasta las ca...
1937    Tiburones en las calles de New Jersey debido a...
1938    Y así se pasean los tiburones por las calles d...
1939    El huracán ''Sandy'' llevó un tiburón a las ca...
1940    RT @DeboConfesarQue: El huracán Sandy llevó un...
1941    El huracán Sandy llevó un tiburón a las carret...
1942    El Hur

In [62]:
#number of languages in training set
trainLangs = {}

for text in train_df["tweetText"]:
    try:
        lan = detect(text)
    except:
        pass
        lan = "Unknown"
    trainLangs[lan] = trainLangs.get(lan, 0) + 1
  

print(trainLangs)

{'es': 1297, 'en': 10949, 'sq': 9, 'ru': 61, 'it': 98, 'no': 38, 'fr': 214, 'mk': 2, 'nl': 88, 'sk': 11, 'pt': 156, 'de': 129, 'tl': 310, 'sv': 42, 'cy': 129, 'id': 177, 'ja': 22, 'ar': 80, 'vi': 14, 'ca': 35, 'so': 125, 'hu': 6, 'fi': 14, 'pl': 36, 'lt': 6, 'af': 73, 'el': 5, 'da': 26, 'he': 1, 'hr': 7, 'bg': 6, 'tr': 29, 'zh-cn': 10, 'ro': 8, 'fa': 4, 'sl': 9, 'sw': 10, 'et': 12, 'ko': 6, 'th': 19, 'cs': 1, 'Unknown': 1, 'hi': 1, 'lv': 1}


In [64]:
#number of languages in test set
testLangs = {}

for text in test_df["tweetText"]:
    try:
        lan = detect(text)
    except:
        pass
        lan = "Unknown"
    testLangs[lan] = testLangs.get(lan, 0) + 1
  

print(testLangs)

{'en': 2783, 'fr': 32, 'es': 61, 'ca': 9, 'it': 17, 'pt': 36, 'nl': 21, 'ro': 1, 'fi': 3, 'id': 5, 'sq': 4, 'ar': 180, 'af': 3, 'hr': 1, 'tr': 12, 'hi': 5, 'ta': 1, 'el': 2, 'so': 510, 'tl': 3, 'pl': 2, 'te': 2, 'ru': 2, 'th': 2, 'sv': 3, 'et': 1, 'vi': 2, 'ko': 1, 'ja': 3, 'cy': 5, 'lt': 1, 'de': 41, 'da': 1}


# Data Preprocessing

# Model Desgin

# Model Training and Testing

# Evaluation