In [2]:
# Mount Google Drive (Colab only) so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the combined multi-source dataset from Drive (columns: urls, text, source, label).
df = pd.read_excel("/content/drive/MyDrive/IDSIA Biomedical Texts/All sources dataset.xlsx")

In [5]:
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [6]:
# Peek at the first five rows of the raw data.
df.head()

Unnamed: 0,urls,text,source,label
0,https://www.quora.com/What-are-panic-attacks-l...,"It’s Friday night. You had a long day, and all...",Quora,Panic
1,https://www.quora.com/What-are-panic-attacks-l...,I have been dealing with these for quite some ...,Quora,Panic
2,https://www.quora.com/What-are-panic-attacks-l...,"I was walking to school, but was running late ...",Quora,Panic
3,https://www.quora.com/What-are-panic-attacks-l...,"For me, panic attacks come two ways: randomly ...",Quora,Panic
4,https://www.quora.com/What-are-panic-attacks-l...,"This is a portion of the handout on anxiety, p...",Quora,Panic


In [7]:
# Count rows per source; also reveals inconsistent spellings of a source name.
df['source'].value_counts()

nomorepanic.co           4158
Quora                    1633
Beyond Blue Forums       1062
Reddit                    700
anxietycommunity          518
www.nomorepanic.co.uk       1
Name: source, dtype: int64

In [8]:
# Inspect the row(s) carrying the stray "www.nomorepanic.co.uk" source name.
df.loc[df['source'].eq("www.nomorepanic.co.uk")]

Unnamed: 0,urls,text,source,label
3520,,He's not here yet Annie :( I'm ready for my bed!!,www.nomorepanic.co.uk,Panic


In [9]:
# Normalise the stray source name by value rather than by a hard-coded row
# label, so the fix keeps working if the spreadsheet's row order changes.
df.loc[df['source'] == 'www.nomorepanic.co.uk', 'source'] = 'nomorepanic.co'

In [10]:
# Re-count rows per source to confirm the stray source name has been merged.
df['source'].value_counts()

nomorepanic.co        4159
Quora                 1633
Beyond Blue Forums    1062
Reddit                 700
anxietycommunity       518
Name: source, dtype: int64

In [11]:
# Class balance: number of rows per label.
df['label'].value_counts()

Panic      5102
Anxiety    2981
Name: label, dtype: int64

## Checking for duplicates

In [12]:
# Show every row whose text occurs more than once (keep=False flags all copies).
# note 169 rows are duplicated just based on text
text_dupe_mask = df.duplicated(subset="text", keep=False)
df[text_dupe_mask]

Unnamed: 0,urls,text,source,label
116,https://www.quora.com/What-are-panic-attacks-l...,THWACK. They’re sudden. They hit you. I rarely...,Quora,Panic
340,https://www.quora.com/What-was-your-first-pani...,It was November of 2018. I woke up the middle ...,Quora,Panic
341,https://www.quora.com/What-was-your-first-pani...,I was six or seven. I didn’t know what was hap...,Quora,Panic
342,https://www.quora.com/What-was-your-first-pani...,This was about 10 years ago. I had rescued a l...,Quora,Panic
343,https://www.quora.com/What-was-your-first-pani...,I was in a doctor’s office. I was trying to ge...,Quora,Panic
...,...,...,...,...
7810,,,Beyond Blue Forums,Anxiety
7932,,"Hi Jay,\nI have read and re-read your post bec...",Beyond Blue Forums,Anxiety
7933,,"Hi Jay,\nI have read and re-read your post bec...",Beyond Blue Forums,Anxiety
8042,,,Beyond Blue Forums,Anxiety


In [13]:
# Find every row containing this answer. regex=False matches the phrase
# literally (otherwise each '.' acts as a regex wildcard); na=False treats
# missing text as "no match", so the mask is purely boolean.
df[df['text'].str.contains('THWACK. They’re sudden.', regex=False, na=False)]

Unnamed: 0,urls,text,source,label
116,https://www.quora.com/What-are-panic-attacks-l...,THWACK. They’re sudden. They hit you. I rarely...,Quora,Panic
812,https://www.quora.com/What-does-anxiety-feel-l...,THWACK. They’re sudden. They hit you. I rarely...,Quora,Anxiety


In [14]:
# First copy of the duplicated answer (row label 116).
df.loc[116]

urls      https://www.quora.com/What-are-panic-attacks-l...
text      THWACK. They’re sudden. They hit you. I rarely...
source                                                Quora
label                                                 Panic
Name: 116, dtype: object

In [15]:
# Second copy of the duplicated answer (row label 812), for comparison with row 116.
df.loc[812]

urls      https://www.quora.com/What-does-anxiety-feel-l...
text      THWACK. They’re sudden. They hit you. I rarely...
source                                                Quora
label                                               Anxiety
Name: 812, dtype: object

#### The same Quora answer appears under both a panic question and an anxiety question, so identical text is labelled both Panic and Anxiety.

In [16]:
df[df.duplicated(keep=False) == True]  # keep=False marks every member of a duplicate group as True
# note 160 rows are duplicates overall

Unnamed: 0,urls,text,source,label
340,https://www.quora.com/What-was-your-first-pani...,It was November of 2018. I woke up the middle ...,Quora,Panic
341,https://www.quora.com/What-was-your-first-pani...,I was six or seven. I didn’t know what was hap...,Quora,Panic
342,https://www.quora.com/What-was-your-first-pani...,This was about 10 years ago. I had rescued a l...,Quora,Panic
343,https://www.quora.com/What-was-your-first-pani...,I was in a doctor’s office. I was trying to ge...,Quora,Panic
344,https://www.quora.com/What-was-your-first-pani...,I was 4 years old… I don't know how that's pos...,Quora,Panic
...,...,...,...,...
7810,,,Beyond Blue Forums,Anxiety
7932,,"Hi Jay,\nI have read and re-read your post bec...",Beyond Blue Forums,Anxiety
7933,,"Hi Jay,\nI have read and re-read your post bec...",Beyond Blue Forums,Anxiety
8042,,,Beyond Blue Forums,Anxiety


In [17]:
# Literal substring search: regex=False stops the trailing '.' from acting as
# a wildcard, and na=False treats missing text as "no match".
df[df['text'].str.contains('I was six or seven.', regex=False, na=False)]

Unnamed: 0,urls,text,source,label
341,https://www.quora.com/What-was-your-first-pani...,I was six or seven. I didn’t know what was hap...,Quora,Panic
555,https://www.quora.com/What-was-your-first-pani...,I was six or seven. I didn’t know what was hap...,Quora,Panic


In [18]:
# Literal substring search (regex=False); na=False treats missing text as
# "no match" so no comparison against True is needed.
df[df['text'].str.contains('It was November', regex=False, na=False)]

Unnamed: 0,urls,text,source,label
340,https://www.quora.com/What-was-your-first-pani...,It was November of 2018. I woke up the middle ...,Quora,Panic
554,https://www.quora.com/What-was-your-first-pani...,It was November of 2018. I woke up the middle ...,Quora,Panic


In [19]:
# Drop rows that are identical across all columns, keeping the first occurrence.
# NOTE(review): rows that share the same text but differ in url/label (e.g. the
# pair at labels 116 and 812 shown above) are NOT removed by this call — confirm
# whether such conflicting-label duplicates should stay in the data.
df.drop_duplicates(keep='first', inplace=True)

In [20]:
# Renumber the index from 0 after dropping rows; drop=True discards the old index.
df.reset_index(inplace=True, drop=True)

In [21]:
# Quick look at the de-duplicated frame.
df.head()

Unnamed: 0,urls,text,source,label
0,https://www.quora.com/What-are-panic-attacks-l...,"It’s Friday night. You had a long day, and all...",Quora,Panic
1,https://www.quora.com/What-are-panic-attacks-l...,I have been dealing with these for quite some ...,Quora,Panic
2,https://www.quora.com/What-are-panic-attacks-l...,"I was walking to school, but was running late ...",Quora,Panic
3,https://www.quora.com/What-are-panic-attacks-l...,"For me, panic attacks come two ways: randomly ...",Quora,Panic
4,https://www.quora.com/What-are-panic-attacks-l...,"This is a portion of the handout on anxiety, p...",Quora,Panic


## Removing texts that have 1024 or more words

In [22]:
# Whitespace-delimited word count per text; values are stringified first, so a
# non-string entry is counted on its string form.
df['Word Count'] = [len(str(entry).split()) for entry in df['text']]

In [23]:
# Confirm the new 'Word Count' column is present.
df.head()

Unnamed: 0,urls,text,source,label,Word Count
0,https://www.quora.com/What-are-panic-attacks-l...,"It’s Friday night. You had a long day, and all...",Quora,Panic,1036
1,https://www.quora.com/What-are-panic-attacks-l...,I have been dealing with these for quite some ...,Quora,Panic,600
2,https://www.quora.com/What-are-panic-attacks-l...,"I was walking to school, but was running late ...",Quora,Panic,400
3,https://www.quora.com/What-are-panic-attacks-l...,"For me, panic attacks come two ways: randomly ...",Quora,Panic,447
4,https://www.quora.com/What-are-panic-attacks-l...,"This is a portion of the handout on anxiety, p...",Quora,Panic,1582


In [24]:
# List the 55 largest word counts to see where the 1024-word cutoff falls.
df['Word Count'].nlargest(55)

1309    5221
452     3496
1586    2460
6895    2130
516     2039
6890    2006
6891    1989
6888    1988
6878    1971
1532    1954
778     1874
671     1859
1438    1658
4       1582
4040    1566
1557    1559
305     1547
196     1491
669     1473
1523    1438
1363    1432
4752    1402
4434    1314
1467    1299
4432    1299
281     1295
902     1278
733     1262
565     1258
1437    1246
786     1242
1204    1236
977     1192
39      1185
429     1181
391     1180
906     1164
887     1119
583     1104
206     1094
4742    1088
1503    1081
4024    1078
1591    1077
4546    1076
4741    1063
4558    1054
0       1036
780     1035
511     1032
731      989
282      982
789      981
6857     981
751      976
Name: Word Count, dtype: int64

In [25]:
# Number of texts with 1024 or more words.
int((df['Word Count'] >= 1024).sum())

50

In [26]:
# Index labels of the over-long rows (1024+ words), collected for dropping.
a = df.index[df['Word Count'] >= 1024].tolist()
print(a)

[0, 4, 39, 196, 206, 281, 305, 391, 429, 452, 511, 516, 565, 583, 669, 671, 733, 778, 780, 786, 887, 902, 906, 977, 1204, 1309, 1363, 1437, 1438, 1467, 1503, 1523, 1532, 1557, 1586, 1591, 4024, 4040, 4432, 4434, 4546, 4558, 4741, 4742, 4752, 6878, 6888, 6890, 6891, 6895]


In [27]:
# Remove, in place, the over-long rows whose index labels were collected in `a`.
df.drop(a, inplace=True)

In [28]:
# Renumber the index from 0 again so later label-based drops use fresh labels.
df.reset_index(inplace=True, drop=True)

## Removing texts that have fewer than 10 words

In [29]:
# Temporarily lift the row-display limit so every short text can be inspected.
pd.set_option('display.max_rows', None)
df[df['Word Count'] < 10]  # 515 rows, too much? with <5 it's 187 rows

Unnamed: 0,urls,text,source,label,Word Count
161,https://www.quora.com/Whats-the-worst-panic-at...,I lost my mom's ring. Couldn't breathe.,Quora,Panic,7
349,https://www.quora.com/How-can-I-tell-if-I-had-...,I am not feeling well.,Quora,Panic,5
419,https://www.quora.com/How-overcome-panic-attac...,Psychotherapy. Any approach you like.,Quora,Panic,5
594,https://socialanxietydisorders.quora.com/Is-an...,It sure feels like it is…,Quora,Anxiety,6
596,https://socialanxietydisorders.quora.com/Is-an...,I think it's the emotional disorder actually,Quora,Anxiety,7
599,https://socialanxietydisorders.quora.com/Is-an...,Yes,Quora,Anxiety,1
631,https://generalizedanxietydisorder.quora.com/I...,Yes it is considered a mental illness\n\nMeera,Quora,Anxiety,8
669,https://www.quora.com/Chemically-and-biologica...,,Quora,Anxiety,1
708,https://www.quora.com/Why-is-anxiety-so-common...,Things move so much faster and social media.,Quora,Anxiety,8
1242,https://www.quora.com/Whats-the-real-cause-beh...,"Genetics, experience and social influences.",Quora,Anxiety,5


In [30]:
# Put a row-display limit back (20 rows) after the full listing above.
pd.set_option('display.max_rows', 20)

In [31]:
# Index labels of the too-short rows (fewer than 10 words), collected for dropping.
b = df.index[df['Word Count'] < 10].tolist()
print(b)

[161, 349, 419, 594, 596, 599, 631, 669, 708, 1242, 1356, 1502, 1503, 1556, 1570, 1580, 1583, 1584, 1586, 1594, 1601, 1635, 1642, 1649, 1651, 1652, 1672, 1673, 1709, 1710, 1712, 1716, 1728, 1737, 1747, 1750, 1755, 1762, 1777, 1779, 1780, 1781, 1782, 1786, 1812, 1815, 1832, 1834, 1850, 1857, 1865, 1868, 1873, 1875, 1878, 1879, 1882, 1883, 1886, 1887, 1898, 1900, 1907, 1909, 1912, 1915, 1917, 1918, 1921, 1922, 1923, 1924, 1926, 1927, 1938, 1941, 1942, 1943, 1944, 1947, 1948, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1959, 1961, 1962, 1963, 1967, 1969, 1977, 1981, 1987, 1992, 1994, 1998, 1999, 2000, 2002, 2005, 2007, 2009, 2010, 2014, 2020, 2026, 2030, 2031, 2033, 2034, 2036, 2038, 2040, 2045, 2046, 2049, 2051, 2052, 2057, 2061, 2062, 2063, 2072, 2086, 2095, 2096, 2099, 2100, 2102, 2104, 2106, 2107, 2109, 2111, 2148, 2153, 2155, 2156, 2157, 2163, 2187, 2190, 2193, 2196, 2198, 2205, 2206, 2207, 2209, 2224, 2229, 2234, 2244, 2246, 2257, 2259, 2270, 2273, 2278, 2279, 2287, 2296, 2307, 2317, 

In [32]:
# Drop the short texts collected in `b`, then renumber the index from 0.
df.drop(b, inplace=True)
df.reset_index(inplace = True, drop=True)

In [33]:
# Display the cleaned frame (truncated by the display.max_rows setting).
df

Unnamed: 0,urls,text,source,label,Word Count
0,https://www.quora.com/What-are-panic-attacks-l...,I have been dealing with these for quite some ...,Quora,Panic,600
1,https://www.quora.com/What-are-panic-attacks-l...,"I was walking to school, but was running late ...",Quora,Panic,400
2,https://www.quora.com/What-are-panic-attacks-l...,"For me, panic attacks come two ways: randomly ...",Quora,Panic,447
3,https://www.quora.com/What-are-panic-attacks-l...,"I’m driving home, riding a tingly high from sa...",Quora,Panic,524
4,https://www.quora.com/What-are-panic-attacks-l...,It feels like your body turns into a black hol...,Quora,Panic,323
...,...,...,...,...,...
7398,,"Hi Ttrung,\nYour story is very similar to my o...",Beyond Blue Forums,Anxiety,236
7399,,Hi startingnew \nThanks for replying I am tryi...,Beyond Blue Forums,Anxiety,20
7400,,Hi Mspurple \nThanks for replying I am trying ...,Beyond Blue Forums,Anxiety,19
7401,,Hi T4 \nThats ok many many people are on medic...,Beyond Blue Forums,Anxiety,39


## Getting Word Embeddings

In [34]:
!pip install torch
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transforme

In [35]:
import torch
from transformers import BertTokenizer, BertModel

In [36]:
# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass return the per-layer hidden
# states, which are read later through outputs.hidden_states.
model = BertModel.from_pretrained(model_name, output_hidden_states=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
# Work on a 100-row random subset to keep the BERT forward pass tractable.
# A fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts.
dftemp = df.sample(100, random_state=42)
dftemp

Unnamed: 0,urls,text,source,label,Word Count
3476,,Originally Posted by Laura123\n\t\t\t\t\t\n\t\...,nomorepanic.co,Panic,204
3717,,"Carnation, you said that there was nothing act...",nomorepanic.co,Panic,138
2997,,You guys both need this tonight so I am sendin...,nomorepanic.co,Panic,12
3099,,I think you ladies need to jump in bed with yo...,nomorepanic.co,Panic,56
2387,,Originally Posted by Annie0904\n\t\t\t\t\t\n\t...,nomorepanic.co,Panic,25
...,...,...,...,...,...
561,https://www.quora.com/How-can-I-handle-panic-a...,I recommend you talk to a psychiatrist as your...,Quora,Panic,36
4927,,[Wow!] Scoot - well done!! It sounds like you'...,nomorepanic.co,Panic,34
5547,,My God Rick....\n\nI take my hat off to you......,nomorepanic.co,Panic,166
6384,,"I have a constant lump in the throat, or itchy...",Beyond Blue Forums,Anxiety,73


In [38]:
# Class balance within the sampled subset.
dftemp['label'].value_counts()

Panic      63
Anxiety    37
Name: label, dtype: int64

In [39]:
# Plain Python list of the sampled texts, in dftemp's row order.
texts = dftemp['text'].tolist()

In [40]:
# Tokenize and convert texts to input IDs
# Accumulators: one encoded tensor per text is appended to each list below.
input_ids = []
attention_masks = []

In [41]:
# Longest remaining texts by word count (tokenization below truncates at 512 tokens).
df['Word Count'].nlargest(185)

706     989
275     982
760     981
6314    981
725     976
       ... 
437     513
841     511
3505    509
3852    505
1235    503
Name: Word Count, Length: 185, dtype: int64

In [42]:
# Tokenize every sampled text into a fixed-length (512-token) BERT input.
# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of the encodings, and so it still works after the
# names have been rebound to concatenated tensors further down.
input_ids = []
attention_masks = []
for text in texts:
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,   # add the [CLS] / [SEP] markers
        max_length=512,
        padding='max_length',      # pad short texts up to max_length
        truncation=True,           # cut long texts down to max_length
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

In [43]:
# One encoded entry per sampled text is expected.
len(input_ids)

100

In [44]:
#input_ids[3116]

In [45]:
#tokenizer.decode(input_ids[3116])

In [46]:
#len(input_ids[3116])

In [47]:
#input_ids[3116]

In [48]:
# len() of a single encoded mask reports its leading (batch) dimension.
len(attention_masks[0])

1

In [49]:
#attention_masks[3116]

In [50]:
# Concatenate input tensors: the per-text tensors are joined along dim 0 into
# one batch tensor each.
# NOTE(review): this rebinds input_ids / attention_masks from lists to tensors,
# so running this cell twice in a row looks unsafe — confirm before re-running.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [51]:
# Inspect the batched token-id tensor (one row per text).
input_ids

tensor([[  101,  2761,  6866,  ...,     0,     0,     0],
        [  101,  2482,  9323,  ...,     0,     0,     0],
        [  101,  2017,  4364,  ...,     0,     0,     0],
        ...,
        [  101,  2026,  2643,  ...,     0,     0,     0],
        [  101,  1045,  2031,  ...,     0,     0,     0],
        [  101, 16183,  2082,  ...,     0,     0,     0]])

In [52]:
# Number of rows in the batched tensor; should equal the number of texts.
len(input_ids)

100

In [53]:
# Pass input tensors through BERT model.
# torch.no_grad() tells PyTorch not to build the autograd graph for this forward
# pass (no backprop is run here), which reduces memory use and speeds things up.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_masks)
# took 4 min to run

In [54]:
# Extract token embeddings: take the last entry of the hidden-state tuple
# returned by the model, which holds one vector per token position per text.
embeddings = outputs.hidden_states[-1]  # last entry of outputs.hidden_states

In [55]:
# Leading dimension of the embeddings tensor: one entry per text.
len(embeddings)

100

In [56]:
# The ten shortest texts in the sample, by word count.
dftemp['Word Count'].nsmallest(10)

3564    10
2580    11
2997    12
2917    12
3434    13
6971    14
1543    14
1656    14
5294    14
5647    15
Name: Word Count, dtype: int64

In [57]:
# Inspect the per-token embedding matrix of the text at position 53.
embeddings[53]

tensor([[ 0.4870, -0.2173,  0.4159,  ..., -0.1632,  0.5009,  0.3217],
        [ 0.3818,  0.6468,  1.5867,  ...,  0.2153,  1.0498, -0.2891],
        [ 1.4473,  0.2803,  1.4300,  ..., -0.3733,  0.8431, -0.0112],
        ...,
        [ 0.2396, -0.1113,  0.5492,  ...,  0.4184,  0.0998, -0.3385],
        [ 0.2562, -0.1854,  0.5391,  ...,  0.3876,  0.1187, -0.3114],
        [ 0.2400, -0.0605,  0.6813,  ...,  0.1833,  0.0223, -0.5453]])

In [58]:
# Number of token positions for one text (the padded sequence length).
len(embeddings[53])

512

In [59]:
# Size of a single token vector (the model's hidden dimension).
len(embeddings[53][0])

768

In [60]:
# Print word embeddings for the first few texts only: printing all of them
# dumps one 512 x 768 tensor per text and floods the notebook output.
PREVIEW_COUNT = 3
for i, text in enumerate(texts[:PREVIEW_COUNT]):
    print(f"Text: {text}")
    print(f"Word Embeddings: {embeddings[i]}")
    print()

Text: Originally Posted by Laura123
					
				
				ok so this is how i make regular chilli
 
Lean steak mince, brown it in a pan with olive oil, add a chopped onion and a clove of garlic, top up with boiling water just to reach top of the mince.  Now add about 2 tablespoons of tomato puree and a beef stock cube.  Add abnout 20 drops tobasco sauce, pepper, about 3 teaspoons oregano and the same of dried or fresh parsely, a tin of kidney beans (drained) and then chilli powder to your own taste, finally a knob of butter to add a bit of richness to the sauce,  leave to bubble away for about 45 mins!  Voila,  delicious xxx

---------- Post added at 20:15 ---------- Previous post was at 20:14 ----------

i have a great recipe for chilli brisket of beef, which is also amazing, its a slow cook in the oven for 5 or 6 hours though,  and you need a fair amound of spices.  let me know if you want it x

---------- Post added at 20:16 ---------- Previous post was at 20:15 ----------

pop a red and g

## Logistic Regression on Word Embeddings and labels

In [61]:
import matplotlib.pyplot as plt
%matplotlib inline 

In [62]:
# First rows of the sampled subset.
dftemp.head()

Unnamed: 0,urls,text,source,label,Word Count
3476,,Originally Posted by Laura123\n\t\t\t\t\t\n\t\...,nomorepanic.co,Panic,204
3717,,"Carnation, you said that there was nothing act...",nomorepanic.co,Panic,138
2997,,You guys both need this tonight so I am sendin...,nomorepanic.co,Panic,12
3099,,I think you ladies need to jump in bed with yo...,nomorepanic.co,Panic,56
2387,,Originally Posted by Annie0904\n\t\t\t\t\t\n\t...,nomorepanic.co,Panic,25


In [63]:
from sklearn.model_selection import train_test_split

In [64]:
# Convert the embeddings tensor to nested Python lists (text -> token -> value).
# This materialises every float as a Python object, so it is memory-heavy.
embeddings_list = embeddings.tolist()

In [65]:
# embeddings_list   # dont run, takes a lot of time just to print

In [66]:
# Store each text's nested embedding list in a new column; the list order
# follows `texts`, which was built from dftemp['text'] in row order.
dftemp['embeddings_list'] = embeddings_list

In [67]:
# Confirm the 'embeddings_list' column was added.
dftemp.head()

Unnamed: 0,urls,text,source,label,Word Count,embeddings_list
3476,,Originally Posted by Laura123\n\t\t\t\t\t\n\t\...,nomorepanic.co,Panic,204,"[[-0.4584769904613495, -0.3796450197696686, 0...."
3717,,"Carnation, you said that there was nothing act...",nomorepanic.co,Panic,138,"[[0.25902029871940613, -0.25859329104423523, -..."
2997,,You guys both need this tonight so I am sendin...,nomorepanic.co,Panic,12,"[[0.11584289371967316, 0.29215824604034424, 0...."
3099,,I think you ladies need to jump in bed with yo...,nomorepanic.co,Panic,56,"[[0.3095615804195404, -0.05129864066839218, 0...."
2387,,Originally Posted by Annie0904\n\t\t\t\t\t\n\t...,nomorepanic.co,Panic,25,"[[0.08861112594604492, -0.20689493417739868, 0..."


In [68]:
# Toy example: flatten a list of lists into one flat list.
l = [[1, 2], [3, 4], [5, 6, 7]]
flat_list = []
for sublist in l:
    flat_list.extend(sublist)
flat_list

[1, 2, 3, 4, 5, 6, 7]

In [69]:
#flat_list_embeddings = [item for sublist in l for item in sublist]

In [70]:
# Same flattening of the toy list `l`, collected under the name used later
# for the real embeddings.
flat_list_embeddings = [item for sublist in l for item in sublist]

In [71]:
# Number of rows in the sampled subset.
len(dftemp.index)

100

In [72]:
# Positional indices 0..n-1 that the flattening loop below iterates over.
print(list(range(0, len(dftemp.index))))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [73]:
# Flatten each text's nested (token -> value) list into one long vector,
# producing one flat list per sampled row, in dftemp's row order.
flat_list_embeddings = [
    [value for token_vector in per_text for value in token_vector]
    for per_text in dftemp['embeddings_list']
]

In [74]:
#dftemp['flat_list_embeddings'] = dftemp['embeddings_list'].apply(lambda x: item for sublist in l for item in sublist)

In [75]:
# Store the flattened vectors as a new column (one flat list per row).
dftemp['flat_list_embeddings'] = flat_list_embeddings

In [76]:
# Confirm the 'flat_list_embeddings' column was added.
dftemp.head()

Unnamed: 0,urls,text,source,label,Word Count,embeddings_list,flat_list_embeddings
3476,,Originally Posted by Laura123\n\t\t\t\t\t\n\t\...,nomorepanic.co,Panic,204,"[[-0.4584769904613495, -0.3796450197696686, 0....","[-0.4584769904613495, -0.3796450197696686, 0.3..."
3717,,"Carnation, you said that there was nothing act...",nomorepanic.co,Panic,138,"[[0.25902029871940613, -0.25859329104423523, -...","[0.25902029871940613, -0.25859329104423523, -0..."
2997,,You guys both need this tonight so I am sendin...,nomorepanic.co,Panic,12,"[[0.11584289371967316, 0.29215824604034424, 0....","[0.11584289371967316, 0.29215824604034424, 0.1..."
3099,,I think you ladies need to jump in bed with yo...,nomorepanic.co,Panic,56,"[[0.3095615804195404, -0.05129864066839218, 0....","[0.3095615804195404, -0.05129864066839218, 0.1..."
2387,,Originally Posted by Annie0904\n\t\t\t\t\t\n\t...,nomorepanic.co,Panic,25,"[[0.08861112594604492, -0.20689493417739868, 0...","[0.08861112594604492, -0.20689493417739868, 0...."


In [77]:
# Length of one flattened vector (first row of the sample).
len(dftemp['flat_list_embeddings'].iloc[0])

393216

In [78]:
# Sanity check: token positions x hidden size, to compare with the flattened length.
512*768

393216

## Logistic Regression

In [None]:
# 70/30 split of flattened embeddings (features) and labels, with a fixed seed.
features, targets = dftemp['flat_list_embeddings'], dftemp['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.3, random_state=42
)

In [None]:
# Build the arrays scikit-learn expects.
# Each element of X_* is a flat Python list, so reshape(-1, 1) would only give
# an (n_samples, 1) object array of lists, which the estimator cannot use.
# Stacking the lists yields a real (n_samples, n_features) float matrix.
# float32 matches the precision the embeddings were computed in and halves
# the memory needed compared with float64.
X_train = np.asarray(X_train.tolist(), dtype=np.float32)
X_test = np.asarray(X_test.tolist(), dtype=np.float32)
# Labels are kept 1-D, the shape classifiers expect for a single target.
y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier created with the library's default settings.
clf = LogisticRegression()

In [None]:
# Number of training samples.
len(X_train)

In [None]:
# Number of training labels; should match the number of training samples.
len(y_train)

In [None]:
# Length of the vector at the last indexed position: text 99, token 511.
len(embeddings_list[99][511])

In [None]:
clf.fit(X_train, y_train)  # X must be 2-D (samples x features); y may be 1-D (the original note says a 2-D column also works)

In [None]:
# Predict labels for the held-out split.
y_pred = clf.predict(X_test)

In [None]:
# Display the predicted labels.
y_pred

In [None]:
X_test    # inspect the held-out features that were passed to predict()

In [None]:
clf.score(X_test, y_test)   # score on the held-out split; no output is recorded for this cell, so no accuracy figure is claimed here

In [None]:
# r2_score is a regression metric and cannot be computed on the string class
# labels ('Panic' / 'Anxiety'); report classification accuracy instead.
from sklearn.metrics import accuracy_score
# np.ravel makes the comparison work whether y_test is 1-D or a single column.
score = accuracy_score(np.ravel(y_test), y_pred)
score