In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [3]:
df = pd.read_csv('/content/drive/MyDrive/IDSIA Biomedical Texts/AllSource_Intensity_ThirdJuly.csv', low_memory=False)
df.head(1)

Unnamed: 0,urls,text,source,label,WC,Analytic,Clout,Authentic,Tone,WPS,...,all_emo_labels,all_emo_label_rank,anger_intensity,anticipation_intensity,disgust_intensity,fear_intensity,joy_intensity,sadness_intensity,surprise_intensity,trust_intensity
0,https://www.quora.com/What-are-panic-attacks-l...,i have been dealing with these for quite some ...,Quora,1,607,55.22,35.35,48.82,1.0,26.39,...,"['fear', 'nervousness', 'confusion', 'curiosit...","{'fear': 1, 'nervousness': 2, 'confusion': 3, ...",0.415048,0.553423,0.272333,0.568205,0.4095,0.467625,0.4345,0.522773


## Getting 2 panic features

In [4]:
import regex as re

In [5]:
panic_symptoms = ["Palpitations", "Pounding heart", "Accelerated heart rate", "Sweating", "Trembling", "Shaking", "Shortness of breath",
"Smothering", "Feelings of choking", "Chest pain", "Discomfort", "Abdominal distress", "Nausea", "Dizziness", "Unsteadiness", "Lightheadedness",
"Faintness", "Chills", "Heat flashes", "Paresthesia", "Numbness", "Tingling sensations", "Derealization", "Depersonalization", "Fear of losing control",
"Fear of going crazy", "Fear of dying", "Mental images of dying", "Mental images of collapsing", "Agoraphobia", "Need to escape"]


panic_symptoms_ext = ["Palpitations", "Pounding heart", "Accelerated heart rate", "Sweating", "Trembling", "Shaking", "Shortness of breath",
"Smothering", "Feelings of choking", "Chest pain", "Discomfort", "Abdominal distress", "Nausea", "Dizziness", "Unsteadiness", "Lightheadedness",
"Faintness", "Chills", "Heat flashes", "Paresthesia", "Numbness", "Tingling sensations", "Derealization", "Depersonalization", "Fear of losing control",
"Fear of going crazy", "Fear of dying", "Mental images of dying", "Mental images of collapsing", "Agoraphobia", "Need to escape"
"Sweat", "Tremble", "Shake", "Shortage of breath","Feeling of choking","Dizzy",
"Faint","Fainted","Chill","Heat flash", "Numb", "Tingling sensation","Mental image of dying", "Mental image of collapsing"]

In [6]:
def count_symptoms_in_text(text, panic_list):
    if isinstance(text, str):
        count = 0
        for symptom in panic_list:
            match = re.search(r'\b{}\b'.format(symptom), text, re.IGNORECASE)
            if match:
                count += 1
        return count

In [7]:
df['symptoms_count'] = df['text'].apply(lambda x: count_symptoms_in_text(x,panic_symptoms))
df['symptoms_ext_count'] = df['text'].apply(lambda x: count_symptoms_in_text(x,panic_symptoms_ext))

In [8]:
pickle_in = open("/content/drive/MyDrive/IDSIA Biomedical Texts/Sentence Embeddings/AllSource_alldistilrobertav1_via_UMAP_SHORTembeddings.pickle", 'rb')
sentence_embeddings = pickle.load(pickle_in)
sentence_embeddings

array([[10.64194  ,  5.0430765,  5.6824026, ...,  4.9058275,  6.8707986,
         4.538727 ],
       [11.312859 ,  5.364349 ,  4.41365  , ...,  4.92234  ,  6.8475184,
         4.5590596],
       [10.531799 ,  4.894456 ,  5.387705 , ...,  4.8968716,  6.8360796,
         4.530069 ],
       ...,
       [10.346373 ,  4.4247556,  3.5815325, ...,  5.0401225,  6.552696 ,
         4.490976 ],
       [10.454275 ,  4.5640407,  3.6035635, ...,  5.0320673,  6.564847 ,
         4.4918733],
       [11.222271 ,  5.1468487,  4.0054016, ...,  5.079988 ,  6.6341186,
         4.5597043]], dtype=float32)

In [9]:
# Normalize sentence embeddings so that all values are between 0 and 1 (becasue emotions features are between 0 and 1 too)
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(0,1))  # (0,1) is default

In [10]:
sentence_embeddings = scaling.fit_transform(sentence_embeddings.reshape(-1, 1)).reshape(*sentence_embeddings.shape) # https://stackoverflow.com/questions/75461346/different-result-from-minmaxscaler-with-manual-calculations
sentence_embeddings

array([[0.8199141 , 0.35134387, 0.40484923, ..., 0.33985746, 0.5043063 ,
        0.30913472],
       [0.8760634 , 0.37823123, 0.298667  , ..., 0.3412394 , 0.502358  ,
        0.31083637],
       [0.81069636, 0.33890578, 0.3801859 , ..., 0.33910793, 0.5014007 ,
        0.30841014],
       ...,
       [0.795178  , 0.29959643, 0.22902691, ..., 0.35109666, 0.4776843 ,
        0.30513844],
       [0.80420834, 0.31125325, 0.2308707 , ..., 0.3504225 , 0.4787012 ,
        0.30521354],
       [0.8684821 , 0.3600286 , 0.26450062, ..., 0.354433  , 0.4844986 ,
        0.31089035]], dtype=float32)

In [11]:
sentence_embeddings.shape

(7405, 28)

In [12]:
# Standardization of LIWC features
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [13]:
standardized_liwc = sc.fit_transform(df.loc[:, 'WC':'Emoji'])

In [14]:
# normalization of 2 panic features
normalized_panic_features = scaling.fit_transform(df[['symptoms_count', 'symptoms_ext_count']])

In [15]:
sentence_embeddings

array([[0.8199141 , 0.35134387, 0.40484923, ..., 0.33985746, 0.5043063 ,
        0.30913472],
       [0.8760634 , 0.37823123, 0.298667  , ..., 0.3412394 , 0.502358  ,
        0.31083637],
       [0.81069636, 0.33890578, 0.3801859 , ..., 0.33910793, 0.5014007 ,
        0.30841014],
       ...,
       [0.795178  , 0.29959643, 0.22902691, ..., 0.35109666, 0.4776843 ,
        0.30513844],
       [0.80420834, 0.31125325, 0.2308707 , ..., 0.3504225 , 0.4787012 ,
        0.30521354],
       [0.8684821 , 0.3600286 , 0.26450062, ..., 0.354433  , 0.4844986 ,
        0.31089035]], dtype=float32)

In [16]:
sentemb_column_names = ["sentemb" + str(i+1) for i in range(28)]

In [17]:
sentembdf = pd.DataFrame(sentence_embeddings, columns=sentemb_column_names)
sentembdf

Unnamed: 0,sentemb1,sentemb2,sentemb3,sentemb4,sentemb5,sentemb6,sentemb7,sentemb8,sentemb9,sentemb10,...,sentemb19,sentemb20,sentemb21,sentemb22,sentemb23,sentemb24,sentemb25,sentemb26,sentemb27,sentemb28
0,0.819914,0.351344,0.404849,0.211062,0.258971,0.257296,0.508351,0.079092,0.046841,0.526339,...,0.494592,0.289889,0.115539,0.196822,0.428880,0.426222,0.362514,0.339857,0.504306,0.309135
1,0.876063,0.378231,0.298667,0.214494,0.349709,0.284757,0.438802,0.078256,0.026417,0.481356,...,0.536341,0.269954,0.126838,0.195539,0.443481,0.403223,0.353871,0.341239,0.502358,0.310836
2,0.810696,0.338906,0.380186,0.207599,0.268363,0.274065,0.500595,0.077948,0.052404,0.508967,...,0.497243,0.285798,0.115411,0.195277,0.431129,0.421081,0.363387,0.339108,0.501401,0.308410
3,0.875869,0.375439,0.291701,0.216481,0.355757,0.287865,0.434504,0.078305,0.024242,0.476598,...,0.540063,0.268896,0.127621,0.195539,0.443904,0.402204,0.353474,0.342344,0.501798,0.311278
4,0.849092,0.364154,0.345076,0.144952,0.283609,0.274024,0.446017,0.077871,0.052790,0.488757,...,0.520263,0.277368,0.124095,0.195726,0.446654,0.415476,0.357244,0.337169,0.496036,0.308303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7400,0.808676,0.343545,0.265276,0.106535,0.323924,0.292145,0.382246,0.077382,0.073316,0.436193,...,0.546514,0.260041,0.122214,0.197972,0.459927,0.397550,0.356333,0.341254,0.485487,0.303029
7401,0.824678,0.309808,0.225578,0.204305,0.355065,0.360845,0.426495,0.070627,0.067943,0.404500,...,0.546183,0.259894,0.116986,0.200168,0.441317,0.389815,0.363444,0.354721,0.477989,0.306945
7402,0.795178,0.299596,0.229027,0.193240,0.349450,0.343462,0.422839,0.070439,0.078386,0.409807,...,0.543561,0.260543,0.115634,0.199547,0.446306,0.389455,0.364519,0.351097,0.477684,0.305138
7403,0.804208,0.311253,0.230871,0.186933,0.347010,0.343995,0.420793,0.071775,0.076749,0.414172,...,0.544412,0.260857,0.116057,0.199328,0.447040,0.390905,0.364248,0.350423,0.478701,0.305214


In [18]:
standardized_liwc

array([[ 3.67802528,  0.98063234, -0.11115907, ...,  0.13908545,
        -0.12035728,  0.17485002],
       [ 2.12422261,  0.93614355, -1.01518881, ..., -0.32861383,
        -0.30797786, -0.05320853],
       [ 2.46364552, -0.16631989,  1.42946272, ...,  0.82624363,
        -0.23487893, -0.05320853],
       ...,
       [-0.75710077,  0.25671387, -0.96127635, ..., -0.86826685,
        -0.30797786, -0.05320853],
       [-0.61378887,  0.25671387,  1.36359921, ..., -0.86826685,
        -0.30797786,  1.76434898],
       [ 0.50253537, -0.1292459 ,  0.3546659 , ..., -0.47971667,
        -0.13335265, -0.05320853]])

In [19]:
liwc_column_names = list(df.loc[:, 'WC':'Emoji'].columns)

In [20]:
stdliwcdf = pd.DataFrame(standardized_liwc, columns=liwc_column_names)
stdliwcdf

Unnamed: 0,WC,Analytic,Clout,Authentic,Tone,WPS,BigWords,Dic,Linguistic,function,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,3.678025,0.980632,-0.111159,-0.503463,-0.906085,0.507052,1.316973,0.342227,-0.601720,-0.018291,...,-0.181918,-0.109396,0.094036,-0.364488,1.739462,-0.232106,-0.240709,0.139085,-0.120357,0.174850
1,2.124223,0.936144,-1.015189,0.979755,-0.703811,-0.305294,-0.124287,-0.340482,0.082425,0.466392,...,-0.181918,-0.109396,-0.338164,0.031678,0.330742,-0.184911,-0.271948,-0.328614,-0.307978,-0.053209
2,2.463646,-0.166320,1.429463,0.293715,-0.906085,-0.375792,0.031591,0.709276,0.143365,0.234690,...,-0.181918,-0.109396,-0.129423,0.213881,-0.231101,-0.200643,-0.271948,0.826244,-0.234879,-0.053209
3,3.059521,0.021001,-0.971103,0.892855,-0.749503,-0.201164,-0.350683,-0.186322,-0.087749,-0.128231,...,2.806013,-0.109396,0.200414,-0.244134,0.388297,0.281791,-0.271948,1.531390,-0.029390,-0.053209
4,1.535890,-0.332567,-1.023422,0.979755,-0.902879,-0.329225,0.032829,0.298182,0.202006,0.394281,...,-0.181918,-0.109396,-0.017025,0.055080,0.218374,0.171670,-0.271948,1.024116,-0.257621,-0.053209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7400,0.887215,-0.969850,-0.505012,-0.373113,-0.518371,-0.104795,0.415103,0.252301,0.893050,0.616526,...,-0.181918,-0.109396,-0.561623,-0.217388,-0.505171,-0.316007,-0.271948,0.041948,-0.273865,-0.053209
7401,-0.749558,0.374960,-1.023422,-1.102307,1.712520,0.093764,1.210579,0.573468,-1.045551,-2.087060,...,-0.181918,-0.109396,-1.239361,-1.135089,-0.968349,-0.316007,-0.271948,-0.868267,-0.307978,-0.053209
7402,-0.757101,0.256714,-0.961276,0.130851,1.712520,0.029087,0.721912,0.525752,-0.047505,-1.215812,...,-0.181918,-0.109396,-1.239361,-1.135089,-0.968349,-0.316007,-0.271948,-0.868267,-0.307978,-0.053209
7403,-0.613789,0.256714,1.363599,-0.953040,1.532692,0.029087,0.721912,1.008420,-1.257118,-0.594000,...,-0.181918,-0.109396,-1.063404,-0.695462,-0.968349,-0.316007,-0.271948,-0.868267,-0.307978,1.764349


In [21]:
emodf = df.loc[:, 'admiration':'neutral']
emodf

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity.1,desire,disappointment,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,0.000016,0.000150,0.000705,0.001447,0.000643,0.009114,0.048850,0.010012,0.000057,0.000440,...,0.000210,0.068864,0.000185,0.000041,0.000323,0.001267,0.000330,0.000575,0.000205,0.000843
1,0.000054,0.167888,0.011522,0.201494,0.000443,0.001504,0.002532,0.000539,0.000486,0.053486,...,0.003212,0.013415,0.000246,0.000617,0.022932,0.000231,0.003261,0.416074,0.001216,0.003240
2,0.000101,0.000596,0.000428,0.001275,0.006295,0.013178,0.412494,0.034596,0.000124,0.002020,...,0.001481,0.035853,0.000615,0.000224,0.336612,0.002241,0.001373,0.007805,0.013074,0.041802
3,0.000054,0.000639,0.043696,0.003672,0.000041,0.007463,0.000333,0.000094,0.000294,0.002422,...,0.005427,0.010949,0.000076,0.000740,0.000619,0.000275,0.001561,0.057985,0.000332,0.000073
4,0.000004,0.000040,0.000176,0.000860,0.000181,0.000255,0.001674,0.000182,0.000016,0.001535,...,0.000145,0.079422,0.000020,0.000017,0.000338,0.000211,0.000071,0.002956,0.000103,0.000174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7400,0.000907,0.000017,0.000027,0.000027,0.000360,0.001514,0.000162,0.000076,0.000035,0.000038,...,0.000018,0.000038,0.000060,0.000021,0.000021,0.000448,0.000136,0.000022,0.000016,0.000068
7401,0.000451,0.000075,0.000005,0.000022,0.000736,0.000157,0.000070,0.000227,0.000052,0.000015,...,0.000010,0.000004,0.000037,0.000003,0.000018,0.000051,0.000023,0.000008,0.000011,0.001105
7402,0.000228,0.000024,0.000010,0.000034,0.000405,0.000364,0.000123,0.000703,0.000040,0.000015,...,0.000008,0.000007,0.000077,0.000003,0.000009,0.000134,0.000023,0.000008,0.000013,0.000393
7403,0.000064,0.000080,0.000109,0.000155,0.012040,0.932658,0.000593,0.001206,0.000311,0.000126,...,0.000124,0.016793,0.015985,0.000026,0.000285,0.005689,0.000256,0.000404,0.000174,0.003708


In [22]:
intensitydf = df.loc[:, 'anger_intensity':'trust_intensity']
intensitydf

Unnamed: 0,anger_intensity,anticipation_intensity,disgust_intensity,fear_intensity,joy_intensity,sadness_intensity,surprise_intensity,trust_intensity
0,0.415048,0.553423,0.272333,0.568205,0.409500,0.467625,0.434500,0.522773
1,0.530400,0.519750,0.541250,0.432167,0.453429,0.315600,0.247333,0.508875
2,0.428600,0.533500,0.228167,0.526192,0.413444,0.468533,0.348500,0.504500
3,0.567200,0.533462,0.114667,0.501952,0.505000,0.522095,0.320500,0.593615
4,0.487000,0.508000,0.482250,0.624833,0.489167,0.505333,0.000000,0.527167
...,...,...,...,...,...,...,...,...
7400,0.396000,0.609000,0.484000,0.527500,0.434000,0.591000,0.793000,0.540800
7401,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.000000
7402,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.641000
7403,0.344000,0.528667,0.000000,0.414000,0.515500,0.500000,0.363500,0.613000


In [23]:
final_df = pd.concat([sentembdf, stdliwcdf, emodf, intensitydf, df['symptoms_ext_count']], axis=1)
final_df

Unnamed: 0,sentemb1,sentemb2,sentemb3,sentemb4,sentemb5,sentemb6,sentemb7,sentemb8,sentemb9,sentemb10,...,neutral,anger_intensity,anticipation_intensity,disgust_intensity,fear_intensity,joy_intensity,sadness_intensity,surprise_intensity,trust_intensity,symptoms_ext_count
0,0.819914,0.351344,0.404849,0.211062,0.258971,0.257296,0.508351,0.079092,0.046841,0.526339,...,0.000843,0.415048,0.553423,0.272333,0.568205,0.409500,0.467625,0.434500,0.522773,8
1,0.876063,0.378231,0.298667,0.214494,0.349709,0.284757,0.438802,0.078256,0.026417,0.481356,...,0.003240,0.530400,0.519750,0.541250,0.432167,0.453429,0.315600,0.247333,0.508875,1
2,0.810696,0.338906,0.380186,0.207599,0.268363,0.274065,0.500595,0.077948,0.052404,0.508967,...,0.041802,0.428600,0.533500,0.228167,0.526192,0.413444,0.468533,0.348500,0.504500,3
3,0.875869,0.375439,0.291701,0.216481,0.355757,0.287865,0.434504,0.078305,0.024242,0.476598,...,0.000073,0.567200,0.533462,0.114667,0.501952,0.505000,0.522095,0.320500,0.593615,1
4,0.849092,0.364154,0.345076,0.144952,0.283609,0.274024,0.446017,0.077871,0.052790,0.488757,...,0.000174,0.487000,0.508000,0.482250,0.624833,0.489167,0.505333,0.000000,0.527167,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7400,0.808676,0.343545,0.265276,0.106535,0.323924,0.292145,0.382246,0.077382,0.073316,0.436193,...,0.000068,0.396000,0.609000,0.484000,0.527500,0.434000,0.591000,0.793000,0.540800,1
7401,0.824678,0.309808,0.225578,0.204305,0.355065,0.360845,0.426495,0.070627,0.067943,0.404500,...,0.001105,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.000000,0
7402,0.795178,0.299596,0.229027,0.193240,0.349450,0.343462,0.422839,0.070439,0.078386,0.409807,...,0.000393,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.641000,0
7403,0.804208,0.311253,0.230871,0.186933,0.347010,0.343995,0.420793,0.071775,0.076749,0.414172,...,0.003708,0.344000,0.528667,0.000000,0.414000,0.515500,0.500000,0.363500,0.613000,0


In [24]:
df['label']

0       1
1       1
2       1
3       1
4       1
       ..
7400    0
7401    0
7402    0
7403    0
7404    0
Name: label, Length: 7405, dtype: int64

In [25]:
print(list(final_df.columns))

['sentemb1', 'sentemb2', 'sentemb3', 'sentemb4', 'sentemb5', 'sentemb6', 'sentemb7', 'sentemb8', 'sentemb9', 'sentemb10', 'sentemb11', 'sentemb12', 'sentemb13', 'sentemb14', 'sentemb15', 'sentemb16', 'sentemb17', 'sentemb18', 'sentemb19', 'sentemb20', 'sentemb21', 'sentemb22', 'sentemb23', 'sentemb24', 'sentemb25', 'sentemb26', 'sentemb27', 'sentemb28', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'BigWords', 'Dic', 'Linguistic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'det', 'article', 'number', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'quantity', 'Drives', 'affiliation', 'achieve', 'power', 'Cognition', 'allnone', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certitude', 'differ', 'memory', 'Affect', 'tone_pos', 'tone_neg', 'emotion', 'emo_pos', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad', 'swear', 'Social', 'socbehav', 'prosocial', 'polite', 'conflict', 'moral', 'comm', 'socrefs', 'family', 'friend', 'female'

## DNN

In [26]:
!pip install tensorflow



In [27]:
!pip install keras



In [28]:
# import numpy as np
# import pandas as pd
# from keras.preprocessing import sequence
# from keras.models import Sequential
# from keras.layers import Dense, Embedding, Dropout
# from keras.layers import LSTM,Bidirectional,GRU,SimpleRNN
# from keras.layers import Conv2D, MaxPooling2D
# from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D,MaxPooling1D, AveragePooling1D
# # from keras.layers import Input, merge, Dropout  # Merge is not supported in Keras +2. Instead, you need to use Concatenate layer
# from keras.layers import Input, Concatenate, Dropout
# from keras.models import Model
# import tensorflow as tf
# #tf.python.control_flow_ops = tf
# #from sklearn.cross_validation import train_test_split  # deprecation of cross_validation sub-module
# from sklearn.model_selection import train_test_split
# from scipy.stats import pearsonr
# import timeit

In [29]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [30]:
X = final_df
y = df['label']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
model = Sequential()

In [33]:
model.add(Dense(64, activation='relu', input_shape=(183,)))  # Input layer with 64 units
model.add(Dense(128, activation='relu'))  # Hidden layer with 128 units
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 unit for binary classification (sigmoid activation)

In [34]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])

In [35]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7b5c2c0cfeb0>

In [36]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.5538589954376221
Test Accuracy: 0.8183659911155701


In [37]:
predictions = model.predict(X_test)



In [38]:
predictions  # probabilities for postiive class, ie 1 or anxiety

array([[0.9681779 ],
       [0.00153738],
       [0.7921547 ],
       ...,
       [0.28679928],
       [0.06520063],
       [0.99999774]], dtype=float32)

In [39]:
# make class predictions with the model
predictions1 = (model.predict(X_test) > 0.5).astype(int)
predictions1



array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [1]])

In [40]:
# summarize the first 5 cases
for i in range(5):
 #print('%s => %d (expected %d)' % (X.values[i].tolist(), predictions1[i], y[i]))
 print('%d (expected %d)' % (predictions1[i], y[i]))

1 (expected 1)
0 (expected 1)
1 (expected 1)
1 (expected 1)
1 (expected 1)


In [41]:
len(final_df), len(df)

(7405, 7405)

## Adding Dropout layer (with double the increase of neurons in subsequent layers)

In [50]:
X = final_df
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(183,)))  # Input layer with 64 units
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))  # Hidden layer with 128 units
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 unit for binary classification (sigmoid activation)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))   # epoch=10 and batch size=32 gives almost the same accuracy of 0.82

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7b5bb0985360>

In [51]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.7281754016876221
Test Accuracy: 0.8143146634101868


Different versions (epoch=30, batch_size=32) -
1. 64 128 - 0.834/0.612 (Accuracy/Loss)
2. 64 128 256 - 0.815/0.658
3. 64 128 256 512 - 0.826/0.613
4. 64 128 256 512 1024 - 0.822/0.665
5. 64 128 256 512 1024 (epoch 30, batch size 10) - 0.814/0.728

## Adding Dense layers (with dropout) with slight increase of neurons in subsequent layers

In [62]:
X = final_df
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = Sequential()
model.add(Dense(32, input_shape=(183,), activation='relu'))  # Hidden layer with 128 units
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))  # Input layer with 64 units
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))  # Hidden layer with 128 units
model.add(Dropout(0.2))
model.add(Dense(160, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(240, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(320, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Output layer with 1 unit for binary classification (sigmoid activation)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.fit(X_train, y_train, epochs=30, batch_size=10, validation_data=(X_test, y_test))   # epoch=10 and batch size=32 gives almost the same accuracy of 0.82

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7b5b882a9930>

In [63]:
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.5634609460830688
Test Accuracy: 0.8143146634101868


Different versions (epoch=30, batch_size=32) -
1. 64 96 128 - 0.825/0.610 (Accuracy/Loss)
2. 64 96 128 160 - 0.821/0.613
3. 64 96 128 160 192 - 0.823/0.628
4. 32 64 96 128 160 192 - 0.808/0.554
5. 32 64 128 160 240 320 (epoch 30, batch size 10) - 0.814/0.563

## Multi-layers (ignore for this file as its anyway not working, check IDSIA Biomedical Texts 19, hopefully it works there)

In [72]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Conv1D, Dense, Concatenate, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split

In [73]:
final_df

Unnamed: 0,sentemb1,sentemb2,sentemb3,sentemb4,sentemb5,sentemb6,sentemb7,sentemb8,sentemb9,sentemb10,...,neutral,anger_intensity,anticipation_intensity,disgust_intensity,fear_intensity,joy_intensity,sadness_intensity,surprise_intensity,trust_intensity,symptoms_ext_count
0,0.819914,0.351344,0.404849,0.211062,0.258971,0.257296,0.508351,0.079092,0.046841,0.526339,...,0.000843,0.415048,0.553423,0.272333,0.568205,0.409500,0.467625,0.434500,0.522773,8
1,0.876063,0.378231,0.298667,0.214494,0.349709,0.284757,0.438802,0.078256,0.026417,0.481356,...,0.003240,0.530400,0.519750,0.541250,0.432167,0.453429,0.315600,0.247333,0.508875,1
2,0.810696,0.338906,0.380186,0.207599,0.268363,0.274065,0.500595,0.077948,0.052404,0.508967,...,0.041802,0.428600,0.533500,0.228167,0.526192,0.413444,0.468533,0.348500,0.504500,3
3,0.875869,0.375439,0.291701,0.216481,0.355757,0.287865,0.434504,0.078305,0.024242,0.476598,...,0.000073,0.567200,0.533462,0.114667,0.501952,0.505000,0.522095,0.320500,0.593615,1
4,0.849092,0.364154,0.345076,0.144952,0.283609,0.274024,0.446017,0.077871,0.052790,0.488757,...,0.000174,0.487000,0.508000,0.482250,0.624833,0.489167,0.505333,0.000000,0.527167,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7400,0.808676,0.343545,0.265276,0.106535,0.323924,0.292145,0.382246,0.077382,0.073316,0.436193,...,0.000068,0.396000,0.609000,0.484000,0.527500,0.434000,0.591000,0.793000,0.540800,1
7401,0.824678,0.309808,0.225578,0.204305,0.355065,0.360845,0.426495,0.070627,0.067943,0.404500,...,0.001105,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.000000,0
7402,0.795178,0.299596,0.229027,0.193240,0.349450,0.343462,0.422839,0.070439,0.078386,0.409807,...,0.000393,0.000000,0.000000,0.000000,0.156000,0.000000,0.000000,0.000000,0.641000,0
7403,0.804208,0.311253,0.230871,0.186933,0.347010,0.343995,0.420793,0.071775,0.076749,0.414172,...,0.003708,0.344000,0.528667,0.000000,0.414000,0.515500,0.500000,0.363500,0.613000,0


In [74]:
X_sentemb = final_df.loc[:, 'sentemb1':'sentemb28']
X_sentemb.head(1)

Unnamed: 0,sentemb1,sentemb2,sentemb3,sentemb4,sentemb5,sentemb6,sentemb7,sentemb8,sentemb9,sentemb10,...,sentemb19,sentemb20,sentemb21,sentemb22,sentemb23,sentemb24,sentemb25,sentemb26,sentemb27,sentemb28
0,0.819914,0.351344,0.404849,0.211062,0.258971,0.257296,0.508351,0.079092,0.046841,0.526339,...,0.494592,0.289889,0.115539,0.196822,0.42888,0.426222,0.362514,0.339857,0.504306,0.309135


In [76]:
# putting panic extended feature with liwc features
X_liwc = final_df.loc[:, 'WC':'Emoji']
X_liwc['symptoms_ext_count'] = final_df['symptoms_ext_count']
X_liwc.head(1)

Unnamed: 0,WC,Analytic,Clout,Authentic,Tone,WPS,BigWords,Dic,Linguistic,function,...,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji,symptoms_ext_count
0,3.678025,0.980632,-0.111159,-0.503463,-0.906085,0.507052,1.316973,0.342227,-0.60172,-0.018291,...,-0.109396,0.094036,-0.364488,1.739462,-0.232106,-0.240709,0.139085,-0.120357,0.17485,8


In [78]:
X_emotions = final_df.loc[:, 'admiration':'neutral']
X_emotions.head(1)

Unnamed: 0,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity.1,desire,disappointment,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,1.6e-05,0.00015,0.000705,0.001447,0.000643,0.009114,0.04885,0.010012,5.7e-05,0.00044,...,0.00021,0.068864,0.000185,4.1e-05,0.000323,0.001267,0.00033,0.000575,0.000205,0.000843


In [79]:
X_intensity = final_df.loc[:, 'anger_intensity':'trust_intensity']
X_intensity.head(1)

Unnamed: 0,anger_intensity,anticipation_intensity,disgust_intensity,fear_intensity,joy_intensity,sadness_intensity,surprise_intensity,trust_intensity
0,0.415048,0.553423,0.272333,0.568205,0.4095,0.467625,0.4345,0.522773


In [80]:
y = df['label']

In [81]:
X_sentemb_train, X_sentemb_test, X_liwc_train, X_liwc_test, X_emotions_train, X_emotions_test, X_intensity_train, X_intensity_test, y_train, y_test = train_test_split(
    X_sentemb, X_liwc, X_emotions, X_intensity, y, test_size=0.2, random_state=42)

In [88]:
# Input for sentemb features
input_sentemb = Input(shape=(28,))
lstm_sentemb = LSTM(64)(input_sentemb)

# Input for LIWC features
input_liwc = Input(shape=(119,))
conv_liwc = Conv1D(128, 3, activation='relu')(input_liwc)
conv_liwc = GlobalMaxPooling1D()(conv_liwc)

# Input for emotions features
input_emotions = Input(shape=(28,))
dense_emotions = Dense(64, activation='relu')(input_emotions)

# Input for intensity features
input_intensity = Input(shape=(8,))
dense_intensity = Dense(32, activation='relu')(input_intensity)

# Concatenate the outputs of all branches
concatenated = Concatenate()([lstm_sentemb, conv_liwc, dense_emotions, dense_intensity])

# Additional Dense layers for further processing
dense1 = Dense(128, activation='relu')(concatenated)
dense2 = Dense(64, activation='relu')(dense1)

# Output layer
output = Dense(1, activation='sigmoid')(dense2)

# Create the model
model = Model(inputs=[input_sentemb, input_liwc, input_emotions, input_intensity], outputs=output)

ValueError: ignored