### Install Dependencies

In [None]:
!pip install <package>

### Import Dependencies

In [None]:
import tensorflow as tf
import pandas as pd
import sagemaker

### Load the tweets csv in a Pandas dataframe using ',' as a separator

In [None]:
df = <enter your code here>

### check the dataset

In [None]:
df.head(5)

### Select relevant columns

In [None]:
tweet_and_sentiment = df[['text','airline_sentiment']]
tweet_and_sentiment.head(5)

### Select only positive and negative tweets from the new DF

In [None]:
tweet_and_sentiment = <enter your code here>
tweet_and_sentiment.head(5)

### Create the Train and Test datasets (75% / 25%)

In [None]:
train = <enter your code here>
test = <enter your code here>

### Convert sentiment label to numeric category using Pandas factorize method.

In [None]:
labels = <enter your code here>
labels

The factorize method encodes each distinct string as an integer code and also returns the distinct strings as an index, where the position of each string is its code.  
For example, with `Index(['positive', 'negative'], dtype='object')`, positive = 0 and negative = 1.

In [None]:
labels_index = labels[1]
train['airline_sentiment'] = labels[0]
train.head(5)

In [None]:
train.to_csv('train.csv', index=False)
test.to_csv('test.csv', index=False)

### Start a SageMaker Session and upload the Train Data using the upload_data method.

Use key_prefix=prefix+'/training' as a parameter of the function

In [None]:
sage_maker_session = <enter your code here>
prefix = 'tensorflow_sentiment_analysis'
training_input_path = sage_maker_session.<enter your code here>

In [None]:
training_input_path

In [None]:
training_data = pd.read_csv(training_input_path, sep=',')

In [None]:
training_data.head(5)

### Create the Training file

In [None]:
%%writefile train.py
import argparse
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense
from tensorflow.keras.layers import Embedding, Dropout
import pandas as pd

if __name__ == '__main__':
    # Training entry point: parse hyperparameters and input/output locations,
    # load the tweets CSV, tokenize and pad the text, then build, train and
    # export an Embedding + LSTM model with a single sigmoid output.
    
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch-size', type=int, default=100)
    parser.add_argument('--learning-rate', type=float, default=0.1)
    

    # The defaults below are read from SM_* environment variables; os.environ[...]
    # raises KeyError if a variable is not set (presumably they are provided by the
    # SageMaker training container -- confirm before running this script locally).
    parser.add_argument('--gpu-count', type=int, default=os.environ['SM_NUM_GPUS'])

    # input data and model directories
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    #parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])

    # parse_known_args tolerates extra, unrecognized command-line arguments.
    args, _ = parser.parse_known_args()
    
    epochs     = args.epochs
    lr         = args.learning_rate  # NOTE: not referenced again in the visible template (the compile hint below uses optimizer='adam')
    batch_size = args.batch_size
    gpu_count  = args.gpu_count  # NOTE: not referenced again in the visible template
    model_dir  = args.model_dir
    training_dir   = args.train
    
    ## TODO: load the training CSV into a pandas DataFrame, using ',' as the separator.
    ## `training_dir` holds the value of --train (presumably a directory that contains the uploaded CSV -- confirm).
    
    training_data = <insert your code here>
    # Raw tweet texts and their labels, as arrays.
    tweet = training_data.text.values
    labels = training_data.airline_sentiment.values
    
    # Fit the tokenizer on the training tweets with a vocabulary cap of num_of_words.
    num_of_words = 5000
    token = Tokenizer(num_words=num_of_words)
    token.fit_on_texts(tweet)
    
    vocab_size = len(token.word_index) + 1 # 1 is added due to 0 index
    
    # Convert each tweet into a sequence of integer word indices.
    tweet_sequence = token.texts_to_sequences(tweet)
    
    # Pad/truncate every sequence to max_len so the model receives fixed-length input.
    max_len = 200
    padded_tweet_sequence = pad_sequences(tweet_sequence, maxlen=max_len)
    
    # Build the model
    embedding_vector_length = 32
    ## TODO: instantiate an empty Sequential model (no arguments),
    ## see https://www.tensorflow.org/guide/keras/sequential_model
    model = <insert your model here>
    
    
    # Layer stack: embedding -> dropout -> LSTM(100) -> dropout -> single sigmoid unit.
    model.add(Embedding(vocab_size, embedding_vector_length, input_length=max_len) )
    model.add(Dropout(0.2))
    model.add(LSTM(100)) 
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    ## TODO: compile the model with
    ## loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']
    <insert your code here>
    
    
    ## TODO: call model.fit with the training data, the labels, a validation split, epochs, batch_size and a verbosity level
    <insert your code here>
    
    ## TODO: save the model with simple_save (TensorFlow 1.15 docs) --> https://docs.w3cub.com/tensorflow~1.15/saved_model/simple_save
    ## pass tf.keras.backend.get_session(), os.path.join(model_dir, '1'), {'inputs': model.input}, {t.name: t for t in model.outputs} as arguments
    <use your code here>



In [None]:
tf_version = tf.__version__
tf_version

### Start the Training

In [None]:
role = sagemaker.get_execution_role()

### Import TensorFlow from the sagemaker package

In [None]:
from sagemaker.tensorflow import TensorFlow




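### Build the tf_estimator with the SageMaker TensorFlow estimator

Complete the parameters (`entry_point` and `role`). The class used here is `sagemaker.tensorflow.TensorFlow` from the SageMaker Python SDK, documented at https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/sagemaker.tensorflow.html
(it is not the same class as TensorFlow's own https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator).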

In [None]:
tf_estimator = TensorFlow(entry_point=, 
                          role=,
                          train_instance_count=1, 
                          train_instance_type='ml.c5.18xlarge',
                          framework_version=tf_version, 
                          py_version='py3',
                          script_mode=True,
                          hyperparameters={
                              'epochs': 10,
                              'batch-size': 64
                          }
                         )

In [None]:
training_input_path

In [None]:
tf_estimator.fit({'train': training_input_path})

In [None]:
import time

# Suffix the endpoint name with the current UTC time so every deployment
# gets a distinct name.
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
endpoint_name = 'tensorflow-sentiment-analysis' + timestamp

# Deploy the trained estimator behind a single-instance endpoint.
end_point = tf_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.4xlarge',
    endpoint_name=endpoint_name,
)

In [None]:
print(end_point.endpoint)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_texts(text):
    """Convert raw texts into padded integer sequences.

    A new ``Tokenizer`` (vocabulary cap 5000) is fitted on the ``text`` column
    of the notebook-level ``training_data`` frame on every call, then used to
    encode ``text``. Each resulting sequence is padded/truncated to 200 entries.

    Args:
        text: The texts to encode; the notebook passes ``test.text.values``
            (presumably an array of strings -- confirm for other callers).

    Returns:
        The result of ``pad_sequences`` for the encoded texts.
    """
    vocabulary_cap = 5000
    sequence_length = 200

    tokenizer = Tokenizer(num_words=vocabulary_cap)
    tokenizer.fit_on_texts(training_data.text.values)

    encoded = tokenizer.texts_to_sequences(text)
    return pad_sequences(encoded, maxlen=sequence_length)

In [None]:
test_texts = preprocess_texts(test.text.values)

In [None]:
import numpy as np

# Print every test tweet with its actual and predicted sentiment.
#
# Predictions are decoded with `labels_index` (the categories returned by
# factorize for the training labels), so the integer -> sentiment mapping is
# the one the model was trained with. A hard-coded ['positive', 'negative']
# list is only correct when 'positive' happened to be encoded as 0. Using
# `labels_index` also avoids rebinding the earlier `labels` variable.
# NOTE(review): each `tweet` is a single padded sequence without a batch
# dimension -- confirm that the endpoint accepts this input shape.
for tweet_text, actual_sentiment, tweet in zip(test.text.values, test.airline_sentiment.values, test_texts):
    print(tweet_text)
    prediction = end_point.predict(tweet)['predictions']
    # Round the sigmoid output to the nearest class code (0 or 1).
    prediction = np.array(prediction).round().item()
    predicted_sentiment = labels_index[int(prediction)]
    print('Actual sentiment: {} ----- Predicted sentiment  {} \n'.format(actual_sentiment, predicted_sentiment))

### Try your own Tweets

### Clean up the workspace

In [None]:
end_point.delete_endpoint()