## Day 85 Lecture 1 Assignment

In this assignment, we will learn how to combine different layer types in a multi-input model to improve model performance.

In [77]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.layers import Input, Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

We will explore a dataset containing information about Twitter users and detect whether or not each user is a bot.

In [2]:
# Read the Twitter user dataset directly from its hosted CSV file.
TWITTER_CSV_URL = 'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/training_data_2_csv_UTF.csv'
twitter_original = pd.read_csv(TWITTER_CSV_URL)

In [3]:
# Preview the first rows of the raw frame to see which columns are available.
twitter_original.head()

Unnamed: 0,id,id_str,screen_name,location,description,url,followers_count,friends_count,listed_count,created_at,favourites_count,verified,statuses_count,lang,status,default_profile,default_profile_image,has_extended_profile,name,bot
0,8.16e+17,"""815745789754417152""","""HoustonPokeMap""","""Houston, TX""","""Rare and strong PokŽmon in Houston, TX. See m...","""https://t.co/dnWuDbFRkt""",1291,0,10,"""Mon Jan 02 02:25:26 +0000 2017""",0,False,78554,"""en""","{\r ""created_at"": ""Sun Mar 12 15:44:04 +0...",True,False,False,"""Houston PokŽ Alert""",1
1,4843621000.0,4843621225,kernyeahx,"Templeville town, MD, USA",From late 2014 Socium Marketplace will make sh...,,1,349,0,2/1/2016 7:37,38,False,31,en,,True,False,False,Keri Nelson,1
2,4303727000.0,4303727112,mattlieberisbot,,"Inspired by the smart, funny folks at @replyal...",https://t.co/P1e1o0m4KC,1086,0,14,Fri Nov 20 18:53:22 +0000 2015,0,False,713,en,"{'retweeted': False, 'is_quote_status': False,...",True,False,False,Matt Lieber Is Bot,1
3,3063139000.0,3063139353,sc_papers,,,,33,0,8,2/25/2015 20:11,0,False,676,en,Construction of human anti-tetanus single-chai...,True,True,False,single cell papers,1
4,2955142000.0,2955142070,lucarivera16,"Dublin, United States",Inspiring cooks everywhere since 1956.,,11,745,0,1/1/2015 17:44,146,False,185,en,,False,False,False,lucarivera16,1


In [4]:
# (rows, columns) of the raw frame before any cleaning.
twitter_original.shape

(2797, 20)

Start by getting rid of all columns that are not useful.

In [5]:
# Answer below:
# Keep only the rows whose `lang` value contains "en". A substring match is
# used instead of equality because the preview shows some raw values wrapped
# in literal quote characters. `na=False` treats a missing language as "not
# English", so the boolean mask can never contain NaN (which boolean indexing
# rejects). The copy is taken after filtering so that only the retained rows
# are duplicated.
english_mask = twitter_original['lang'].str.contains("en", na=False)
twitter = twitter_original[english_mask].copy()

In [6]:
# Remove the identifier, free-text, and timestamp columns listed below.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# is deprecated since pandas 1.3 and rejected by pandas 2.x.
unused_columns = ['id', 'id_str', 'screen_name', 'location', 'url',
                  'created_at', 'status', 'name', 'lang']
twitter = twitter.drop(columns=unused_columns)

Next, get rid of all columns that contain more than 30% missing data. After that, remove all rows containing at least one missing observation.

In [7]:
# Answer below:
# Fraction of missing values in each column, largest first.
missing_fraction = twitter.isna().mean()
missing_fraction.sort_values(ascending=False)

description              0.135963
has_extended_profile     0.036385
bot                      0.000000
default_profile_image    0.000000
default_profile          0.000000
statuses_count           0.000000
verified                 0.000000
favourites_count         0.000000
listed_count             0.000000
friends_count            0.000000
followers_count          0.000000
dtype: float64

In [8]:
# Step 1: drop every column in which more than 30% of the values are missing.
# Step 2: drop every remaining row that has at least one missing value.
MAX_MISSING_FRACTION = 0.3
column_missing_fraction = twitter.isnull().mean()
sparse_columns = column_missing_fraction[column_missing_fraction > MAX_MISSING_FRACTION].index
twitter = twitter.drop(columns=sparse_columns).dropna()

In [9]:
# Confirm that no nulls remain and inspect the dtype of every column.
twitter.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2220 entries, 0 to 2796
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   description            2220 non-null   object
 1   followers_count        2220 non-null   int64 
 2   friends_count          2220 non-null   int64 
 3   listed_count           2220 non-null   int64 
 4   favourites_count       2220 non-null   int64 
 5   verified               2220 non-null   bool  
 6   statuses_count         2220 non-null   int64 
 7   default_profile        2220 non-null   bool  
 8   default_profile_image  2220 non-null   bool  
 9   has_extended_profile   2220 non-null   object
 10  bot                    2220 non-null   int64 
dtypes: bool(3), int64(6), object(2)
memory usage: 162.6+ KB


In [10]:
# Preview the cleaned frame.
twitter.head()

Unnamed: 0,description,followers_count,friends_count,listed_count,favourites_count,verified,statuses_count,default_profile,default_profile_image,has_extended_profile,bot
0,"""Rare and strong PokŽmon in Houston, TX. See m...",1291,0,10,0,False,78554,True,False,False,1
1,From late 2014 Socium Marketplace will make sh...,1,349,0,38,False,31,True,False,False,1
2,"Inspired by the smart, funny folks at @replyal...",1086,0,14,0,False,713,True,False,False,1
4,Inspiring cooks everywhere since 1956.,11,745,0,146,False,185,False,False,False,1
5,Just a guy trying to do good by telling everyo...,1,186,0,0,False,11,True,False,True,1


Now we will reuse the text-normalization functions (stopword removal, mention removal, and stemming) from a previous assignment.

In [35]:
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer
import nltk
# Fetch the NLTK stopword corpus that remove_stopwords() reads through
# stopwords.words('english').
nltk.download('stopwords')

# Shared Porter stemmer instance used by stem_list().
stemmer = PorterStemmer()

def remove_stopwords(input_text):
    """Remove English stopwords and single-character tokens from a string.

    The text is split on whitespace only and tokens are compared by exact
    string equality, so a stopword that is attached to punctuation (for
    example ``"the,"``) or that differs in case is not recognised and is kept.

    Args:
        input_text: The text to filter.

    Returns:
        The surviving tokens, in their original order, joined by single
        spaces. An input with no surviving tokens yields ``""``.
    """
    # A set gives constant-time membership tests; scanning the stopword list
    # linearly for every token is unnecessary work.
    stopword_set = set(stopwords.words('english'))
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = {"n't", "not", "no"}
    return " ".join(
        word
        for word in input_text.split()
        if len(word) > 1 and (word in whitelist or word not in stopword_set)
    )

def remove_mentions(input_text):
    """Return ``input_text`` with every ``@`` + word-character run deleted."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', input_text)

def stem_list(word_list):
    """Stem every word with the module-level ``stemmer``.

    Returns a new list holding the stemmed words in their original order.
    """
    return [stemmer.stem(token) for token in word_list]

def normalize(terms):
    """Lower-case, filter, tokenise, and stem a piece of free text.

    Steps, in order: lower-case the text, drop stopwords
    (``remove_stopwords``), strip @mentions (``remove_mentions``), split on
    the punctuation/whitespace delimiters below, stem every token
    (``stem_list``), and join the stems with single spaces.

    Args:
        terms: The raw text to normalise.

    Returns:
        The stemmed tokens separated by exactly one space, with no leading or
        trailing space. Text that contains no usable token yields ``''``.
    """
    terms = terms.lower()
    terms = remove_stopwords(terms)
    terms = remove_mentions(terms)
    word_delimiters = u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013 ]'
    tokens = [piece.rstrip() for piece in re.split(word_delimiters, terms)]
    # Adjacent delimiters make re.split emit empty strings. Joining those and
    # then collapsing with a single replace('  ', ' ') pass only shortens each
    # pair of spaces once, so runs of three or more spaces (and leading or
    # trailing spaces) survived. Dropping the empty tokens up front avoids
    # producing the extra spaces in the first place.
    stemmed = stem_list([token for token in tokens if token])
    return ' '.join(stemmed)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


We will create two branches: one branch will process the text data in the description column, and the other will process all other columns. First, create a numpy array with the encoded data from the description column: normalize each description, one-hot encode the text, pad each row to a common length, and collect the result in a numpy array.

In [36]:
# Answer below:
# Store a normalised copy of every description in a new column.
normalized_descriptions = twitter['description'].map(normalize)
twitter['text_norm'] = normalized_descriptions

In [87]:
# Number of distinct whitespace-separated tokens across all normalised
# descriptions. Tokens are added to a set one description at a time;
# `.str.split().sum()` instead concatenates every token list into one
# ever-growing list, which is quadratic in the total number of tokens.
vocabulary = set()
for text in twitter['text_norm']:
    vocabulary.update(text.split())
vocab_size = len(vocabulary)
vocab_size

6741

In [49]:
# Encode each normalised description as a list of integer word indices by
# calling Keras `one_hot` with `vocab_size` as its size argument.
twitter['encoded'] = twitter['text_norm'].apply(lambda text: one_hot(text, vocab_size))

In [52]:
# Pad the variable-length index lists into a single rectangular integer array.
encoded_sequences = twitter['encoded'].tolist()
independent_vars = pad_sequences(encoded_sequences)
independent_vars

array([[   0,    0,    0, ..., 2117, 1798,  893],
       [   0,    0,    0, ..., 1126, 6698, 6685],
       [   0,    0,    0, ..., 1556, 5404,  169],
       ...,
       [   0,    0,    0, ...,    0,    0, 4603],
       [   0,    0,    0, ..., 3758, 5094, 6357],
       [   0,    0,    0, ..., 4500, 2280, 6463]], dtype=int32)

Convert all boolean variables to numeric values (0 for False and 1 for True).

In [53]:
# Preview the frame with the new text_norm and encoded columns.
twitter.head()

Unnamed: 0,description,followers_count,friends_count,listed_count,favourites_count,verified,statuses_count,default_profile,default_profile_image,has_extended_profile,bot,text_norm,encoded
0,"""Rare and strong PokŽmon in Houston, TX. See m...",1291,0,10,0,False,78554,True,False,False,1,rare strong pokžmon houston tx see pokžmon ht...,"[2820, 1045, 651, 796, 1859, 1411, 651, 1642, ..."
1,From late 2014 Socium Marketplace will make sh...,1,349,0,38,False,31,True,False,False,1,late 2014 socium marketplac make shop fundamen...,"[2928, 469, 6149, 3226, 4201, 588, 5440, 4847,..."
2,"Inspired by the smart, funny folks at @replyal...",1086,0,14,0,False,713,True,False,False,1,inspir smart funni folk _ñ¢ tweet monday wedn...,"[1314, 6353, 2373, 5018, 5404, 5953, 6097, 150..."
4,Inspiring cooks everywhere since 1956.,11,745,0,146,False,185,False,False,False,1,inspir cook everywher sinc 1956,"[1314, 1344, 326, 6628, 350]"
5,Just a guy trying to do good by telling everyo...,1,186,0,0,False,11,True,False,True,1,guy tri good tell everyon els get better,"[2657, 5437, 6505, 318, 2317, 5671, 6464, 6061]"


In [70]:
# Answer below:
# Everything except the raw/processed text columns and the target feeds the
# numeric branch. `columns=` replaces the positional axis argument
# (`drop(labels, 1)`), which pandas 2.x rejects.
non_text = twitter.drop(columns=['description', 'text_norm', 'encoded', 'bot'])

# Columns whose name ends in "_count".
count_cols = non_text.filter(regex='_count$', axis=1).columns

# All remaining columns (the profile flags); Index.difference already returns
# the column labels, so no intermediate frame selection is needed.
profile_cols = non_text.columns.difference(count_cols)

In [71]:
# Show which columns landed in each group.
count_cols, profile_cols

(Index(['followers_count', 'friends_count', 'listed_count', 'favourites_count',
        'statuses_count'],
       dtype='object'),
 Index(['default_profile', 'default_profile_image', 'has_extended_profile',
        'verified'],
       dtype='object'))

In [72]:
# Encode the profile flags as integers: 1 where the value equals True, 0 otherwise.
profile_flags = non_text[profile_cols].eq(True)
non_text[profile_cols] = profile_flags.astype(int)

Min-max scale the data describing each user (do not min-max scale the word embeddings).

In [75]:
# Answer below:
# Min-max scale every non-text feature column; fitting and transforming are
# done on the same frame.
scaler = MinMaxScaler()
scaler.fit(non_text)
non_text_scaled = scaler.transform(non_text)

Now we'll create the two branches. Create a model for the numeric data that consists of three layers: an input layer and two dense hidden layers of size 32.

In [82]:
# Answer below:
# (samples, features) of the scaled numeric input; the feature count sizes the input layer.
non_text_scaled.shape

(2220, 9)

In [81]:
# Numeric branch: one input per scaled feature column, followed by two
# 32-unit ReLU dense layers.
num_features = non_text_scaled.shape[1]
input_layer1 = Input(shape=(num_features,))
dense11 = Dense(32, activation='relu')(input_layer1)
dense12 = Dense(32, activation='relu')(dense11)

Create the second branch of the model using the encoded words. This branch will consist of four layers: an input layer, an embedding layer producing 100-dimensional vectors, an LSTM layer with 32 units, and a dense layer with 32 units.

In [88]:
# Answer below:
# Embedding input size: one more than the largest index present in the padded sequences.
max_words = independent_vars.max() + 1
independent_vars.shape, max_words

((2220, 66), 6741)

In [90]:
# Text branch input: one integer word index per padded sequence position.
input_layer2 = Input(shape=(independent_vars.shape[1],))
# Map each word index (0 .. max_words - 1) to a 100-dimensional vector.
embed1 = Embedding(max_words, 100, input_length=independent_vars.shape[1])(input_layer2)
# 32-unit LSTM over the embedded sequence, then a 32-unit ReLU dense layer.
lstml = LSTM(32)(embed1)
dense21 = Dense(32, activation='relu')(lstml)

Merge the two models using the `concatenate` function (merge the two final dense layers in each branch) and create an output dense layer.

In [91]:
# Answer below:
# Join the final dense layer of the numeric branch with that of the text branch.
merge = concatenate([dense12, dense21])

# Single sigmoid unit for the binary bot / not-bot prediction.
output = Dense(1, activation='sigmoid')(merge)

Create a model using the two inputs and the single output and print the summary

In [92]:
# Answer below: 
# Two-input model; inputs must later be supplied in this order: numeric features, then padded sequences.
model = Model(inputs=[input_layer1, input_layer2], outputs=output)

# Print the layer-by-layer summary.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            [(None, 66)]         0                                            
__________________________________________________________________________________________________
input_5 (InputLayer)            [(None, 9)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 66, 100)      674100      input_6[0][0]                    
__________________________________________________________________________________________________
dense_11 (Dense)                (None, 32)           320         input_5[0][0]                    
____________________________________________________________________________________________

Compile and fit the model using the appropriate optimizer, loss, and metrics. Train the model for 10 epochs with a batch size of 128.

In [95]:
# Answer below:
# Label column used as the training target.
target = twitter['bot']

In [96]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs, batch_size = 10, 128

# The input list order matches the order of the model's inputs:
# scaled numeric features first, padded word-index sequences second.
model.fit(
    x=[non_text_scaled, independent_vars],
    y=target,
    epochs=epochs,
    batch_size=batch_size,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f1da02fe668>

# Lecture Notes

In [23]:
from tensorflow.keras.layers import Input, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model

In [24]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical

# Load the MNIST train/test split bundled with Keras (downloaded on first use).
(X_train, y_train), (X_test, y_test) = mnist.load_data()

input_dim = 784 #28x28
output_dim = num_classes = 10 #number of classes 0-9
batch_size = 128
num_epochs = 20

# Flatten every image into a vector of length input_dim. Using -1 lets NumPy
# infer the sample count, so both splits are reshaped the same way instead of
# hard-coding 60000 and 784 for the training split only.
X_train = X_train.reshape(-1, input_dim).astype('float32')
X_test = X_test.reshape(-1, input_dim).astype('float32')
# Divide by 255 (presumably mapping 8-bit pixel intensities into [0, 1]).
X_train /= 255
X_test /= 255

# One-hot encode the digit labels into num_classes columns.
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [25]:
# Input: one flattened image vector (input_dim == 784) per sample.
inputs = Input(shape=(input_dim,))

# Two stacked 64-unit ReLU hidden layers.
h1 = Dense(64, activation='relu')(inputs)
h2 = Dense(64, activation='relu')(h1)

# Softmax over the digit classes (num_classes == 10).
outputs = Dense(num_classes, activation='softmax')(h2)

model = Model(inputs=inputs, outputs=outputs)

In [26]:
# Print the layer-by-layer summary of the dense MNIST model.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 784)]             0         
_________________________________________________________________
dense (Dense)                (None, 64)                50240     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                650       
Total params: 55,050
Trainable params: 55,050
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Categorical cross-entropy matches the one-hot labels and softmax output.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The test split is passed as validation data, so it is evaluated after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=80,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
# One input feeding two outputs.
# Input shape (100, 1); the LSTM reduces it to a 10-dimensional feature
# vector shared by both branches.
input_layer = Input(shape=(100, 1))
extract = LSTM(10)(input_layer)

# First branch: three stacked ReLU dense layers (10 -> 20 -> 10 units).
path1 = Dense(10, activation='relu')(extract)
path12 = Dense(20, activation='relu')(path1)
path13 = Dense(10, activation='relu')(path12)

# Second branch: 10-way softmax computed directly from the LSTM features.
output_layer2 = Dense(10, activation='softmax')(extract)

# Output of the first branch: a single sigmoid unit.
output_layer1 = Dense(1, activation='sigmoid')(path13)

# Outputs are listed as [sigmoid head, softmax head]; training targets must follow the same order.
model = Model(inputs=input_layer, outputs=[output_layer1, output_layer2])

# Write a diagram of the graph to a PNG file, then print the text summary.
plot_model(model, to_file='one_input_two_outputs.png')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 100, 1)]     0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 10)           480         input_2[0][0]                    
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 10)           110         lstm[0][0]                       
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 20)           220         dense_3[0][0]                    
____________________________________________________________________________________________

In [29]:
# model.fit(x=X_train, y=[y_train1, y_train2])

In [30]:
from tensorflow.keras.layers import Flatten, Conv2D, MaxPool2D, concatenate, Dropout

In [31]:
# Two inputs merged into one output.
# Branch 1: 64x64 single-channel input -> 32-filter 4x4 convolution -> 2x2 max pooling -> flatten.
input_layer1 = Input(shape=(64,64,1))
conv1 = Conv2D(32, kernel_size=4, activation='relu')(input_layer1)
pool1 = MaxPool2D(pool_size=(2,2))(conv1)
flat1 = Flatten()(pool1)

# Branch 2: 32x32 three-channel input through the same conv -> pool -> flatten stack.
input_layer2 = Input(shape=(32,32,3))
conv2 = Conv2D(32, kernel_size=4, activation='relu')(input_layer2)
pool2 = MaxPool2D(pool_size=(2,2))(conv2)
flat2 = Flatten()(pool2)

# Join the two flattened feature vectors into one.
merge = concatenate([flat1, flat2])

# Shared head: dense -> dropout (rate 0.5) -> dense -> single sigmoid unit.
dense1 = Dense(10, activation='relu')(merge)
dropout = Dropout(0.5)(dense1)
dense2 = Dense(10, activation='relu')(dropout)
output = Dense(1, activation='sigmoid')(dense2)

# Inputs must be supplied in this order when fitting or predicting.
model = Model(inputs=[input_layer1, input_layer2], outputs=output)

# Write a diagram of the graph to a PNG file, then print the text summary.
plot_model(model, to_file='two_inputs_one_output.png')
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 64, 64, 1)]  0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 32, 32, 3)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 61, 61, 32)   544         input_3[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 29, 29, 32)   1568        input_4[0][0]                    
____________________________________________________________________________________________

In [32]:
# model.fit(x=[X_train1, X_train2], y=y_train)