In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# NLP Package
import nltk
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.corpus import words
  
# Misc.
import re

#### Hello World in Data Science

In [3]:
# Load the training data. The path is relative, so train.csv has to sit in the
# notebook's working directory.
df = pd.read_csv('train.csv', sep=',')

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Preview the last rows of the loaded frame.
df.tail()

In [None]:
# Drop the 'id' column. Reassigning (rather than inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after 'id' is
# already gone no longer raises KeyError.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Re-check the first rows after dropping the 'id' column.
df.head()

In [None]:
# Keep the label column under its own name and show how many rows each
# sentiment value has.
sentiment = df['sentiment']
sentiment.value_counts()

In [None]:
# Bar chart of rows per sentiment class. The series is passed as `x=` because
# newer seaborn releases accept only `data` positionally; a bare positional
# series would be read as `data` rather than as the variable to count.
g = sns.countplot(x=sentiment)

#### Text Exploration 

In [None]:
# Raw exploration: concatenate every entry of the 'content' column into one
# space-separated string.
corpus = ' '.join(text for text in df['content'])

In [None]:
# corpus is one str, so len() here is its length in characters.
print('Word/Character/Symbol/Anything count in corpus:', len(corpus))

In [None]:
# Tokenization
def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    ``str.split()`` without a separator is used instead of ``split(' ')`` so
    that repeated spaces do not produce empty-string tokens and tabs or
    newlines act as delimiters instead of staying glued to a word.

    Args:
        text: The string to tokenise.

    Returns:
        A list of non-empty tokens; an empty list for empty or
        all-whitespace input.
    """
    return text.split()

# Tokenise the whole corpus, then derive the distinct tokens before reporting
# both sizes.
tokenised_corpus = tokenizer(corpus)
tokenised_unique = set(tokenised_corpus)
print('Number of tokens:', len(tokenised_corpus))
print('Number of unique tokens:', len(tokenised_unique))

In [None]:
# Word frequency distribution over the tokenised corpus.
fdist = FreqDist(tokenised_corpus)
# Not the cell's last expression, so this result is computed but not displayed.
fdist.most_common(50)
# Displayed output: every (token, count) pair, lowest count first.
sorted(fdist.most_common(), key=lambda pair: pair[1])

The most frequent tokens are all stop words; they can be added to the stop-word list used for text preprocessing later. Note that some stop words may carry sentiment (e.g. "good", "not"), so they should not be removed blindly.

In [None]:
# Functions for preprocessing
def lowercase(text):
    """Return a new list holding the lower-cased form of every token in ``text``."""
    lowered = []
    for token in text:
        lowered.append(token.lower())
    return lowered

def remove_non_letters(text):
    """Strip every character that is not a lowercase a-z letter or whitespace.

    Args:
        text: Iterable of token strings. Uppercase letters are removed as
            well, so tokens should be lower-cased first.

    Returns:
        A list with one cleaned string per input token. A token made up
        entirely of removed characters becomes ''.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # Python reports with a warning (and intends to reject in the future).
    return [re.sub(r'[^a-z\s]', '', word) for word in text]

def tokenizer(text):
    """Split ``text`` into tokens on runs of whitespace.

    This redefines the ``tokenizer`` name introduced in the exploration cell.
    ``str.split()`` without a separator is used instead of ``split(' ')`` so
    that repeated spaces do not produce empty-string tokens and tabs or
    newlines act as delimiters instead of staying glued to a word.

    Args:
        text: The string to tokenise.

    Returns:
        A list of non-empty tokens; an empty list for empty or
        all-whitespace input.
    """
    return text.split()

# One shared lemmatizer instance, reused by every call below.
lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    """Lemmatize each token of ``text`` (preferred here over stemming).

    ``lemmatize`` is called with the token only, so the lemmatizer's default
    part-of-speech setting applies. Returns a new list of the same length.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Merge the NLTK and spaCy English stop-word collections; going through a set
# removes the words both sources share.
nltk_stopwords = stopwords.words("english")
spacy_stopwords = list(STOP_WORDS)
final_stopwords = list(set(nltk_stopwords).union(spacy_stopwords))
def remove_stopword(text, stopword_list=None):
    """Drop stop words from a list of tokens.

    Args:
        text: Iterable of token strings.
        stopword_list: Optional iterable of words to remove. When omitted,
            the module-level ``final_stopwords`` is used, so existing
            ``Series.apply(remove_stopword)`` calls behave as before.

    Returns:
        A new list with the tokens that are not stop words, in their
        original order.
    """
    # Testing membership against a list scans it for every token; a set makes
    # each lookup constant-time. The set is built per call so that words
    # appended to ``final_stopwords`` later are still honoured.
    blocked = frozenset(final_stopwords if stopword_list is None else stopword_list)
    return [word for word in text if word not in blocked]

def remove_whitespace(text):
    """Return the tokens of ``text`` with empty strings filtered out.

    Only the exact empty string is dropped; a token that consists of space
    characters is kept.
    """
    kept = []
    for token in text:
        if token != '':
            kept.append(token)
    return kept

def remove_handle(text):
    """Return the tokens of ``text`` that do not contain an '@' character."""
    return list(filter(lambda token: '@' not in token, text))

def convert_to_string(text):
    """Join the tokens of ``text`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(text)

In [None]:
# Tokenise each text, then lower-case its tokens.
df['content'] = df['content'].apply(tokenizer).apply(lowercase)
print(df['content'] )

# Drop tokens containing '@', then strip every character that is not a-z or
# whitespace from the remaining tokens.
df['content'] = df['content'].apply(remove_handle).apply(remove_non_letters)
print(df['content'])

In [None]:
# Stop words are removed both before and after lemmatization, then the empty
# tokens are dropped.
df['content'] = (
    df['content']
    .apply(remove_stopword)
    .apply(lemmatization)
    .apply(remove_stopword)
    .apply(remove_whitespace)
)

In [None]:
# Inspect the token lists produced by the preprocessing steps so far.
df['content']

In [None]:
words_list = words.words()
print(len(words_list))
# Set copy of the vocabulary: looking a token up in the list scans the whole
# word list each time, which is needlessly slow when repeated for every token
# of every row.
known_words = frozenset(words_list)
def filter_gibberish(text):
    """Keep only the tokens of ``text`` that appear in the NLTK ``words`` corpus.

    Args:
        text: Iterable of token strings.

    Returns:
        A new list with the recognised tokens, in their original order.
    """
    return [word for word in text if word in known_words]

In [None]:
# Drop tokens missing from the word list, then join what is left back into a
# single space-separated string per row.
df['content'] = df['content'].apply(filter_gibberish).apply(convert_to_string)

In [None]:
# Inspect the cleaned text, now one string per row again.
df['content']

### Text Processing
1. tf-idf
2. encoding sentiments

In [None]:
# Encoding Sentiments
def sentiments_encoding(sentiment):
    """Map a sentiment label to its integer class id.

    The id is the label's position in the fixed order
    happy, sad, neutral, fury. A value outside that list raises ValueError.
    """
    label_order = ['happy', 'sad', 'neutral', 'fury']
    class_id = label_order.index(sentiment)
    return class_id

# Replace the label strings with integer class ids. Not idempotent: once the
# column holds integers, running this again raises ValueError because the
# integers are not in the label list.
df['sentiment'] = df['sentiment'].apply(sentiments_encoding)

In [None]:
# Class balance after encoding: rows per integer class id.
df['sentiment'].value_counts()

In [None]:
# Preview the frame with cleaned text and encoded labels.
df.head()

In [None]:
# Persist the cleaned frame. NOTE(review): the index is written as well, which
# is presumably why the cell that reloads this file drops an 'Unnamed: 0' column.
df.to_csv("cleaned.csv")

In [None]:
# Reload the cleaned data, discard the 'Unnamed: 0' column and every row that
# has a missing value, then summarise the result.
test_df = (
    pd.read_csv("cleaned.csv")
    .drop(columns='Unnamed: 0')
    .dropna(axis=0)
)
test_df.info()

In [None]:
# Prepare word vector 
cv = CountVectorizer()
words_sparse_matrix = cv.fit_transform(test_df['content'])
# get_feature_names() has been removed from recent scikit-learn releases in
# favour of get_feature_names_out(); prefer the new name and fall back to the
# old one so older installations keep working.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = list(cv.get_feature_names())
# .toarray() is the documented dense conversion; the `.A` shorthand is not
# available on newer SciPy sparse containers.
sparse_df = pd.DataFrame(data=words_sparse_matrix.toarray(), columns=feature_names)
sparse_df

### Model

In [None]:
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
print("Version: ", tf.__version__) # Check tf version
# Query the GPU list once and reuse it for both the status line and the config.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("GPU is", "available" if physical_devices else "NOT AVAILABLE") # Check GPU status
# Looping (instead of indexing [0]) avoids an IndexError on CPU-only machines
# and applies the same memory-growth setting to every visible GPU.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Display the bag-of-words frame built from the CountVectorizer output.
sparse_df

In [None]:
MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 100

# Cap the vocabulary at MAX_NB_WORDS. The Embedding layer is built with
# input_dim=MAX_NB_WORDS, so an uncapped tokenizer could emit token ids the
# embedding table cannot look up; with num_words set, texts_to_sequences only
# emits ids below that limit.
# NOTE: this rebinds the name `tokenizer`, which earlier cells define as a
# whitespace-splitting function. Re-run that definition before re-running the
# preprocessing cells.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(test_df['content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Features: integer token sequences, padded to MAX_SEQUENCE_LENGTH.
X = pad_sequences(
    tokenizer.texts_to_sequences(test_df['content'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print('Shape of data tensor:', X.shape)

# Targets: one-hot encoding of the integer sentiment ids.
y = to_categorical(test_df['sentiment'])
y.shape

In [None]:
# Hold out 33% of the rows. A fixed random_state makes the split (and so every
# metric computed from it) reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

In [None]:
embedding_dim = 50

model = Sequential()
# Token ids -> dense vectors, flattened into one feature vector per sample.
model.add(Embedding(input_dim=MAX_NB_WORDS,
                    output_dim=embedding_dim,
                    input_length=MAX_SEQUENCE_LENGTH))
model.add(Flatten())

# Three fully connected blocks of growing width, each followed by dropout.
model.add(Dense(256, activation='relu'))
model.add(layers.Dropout(0.2))

model.add(Dense(512, activation='relu'))
model.add(layers.Dropout(0.2))

model.add(Dense(1024, activation='relu'))
model.add(layers.Dropout(0.5))

# One softmax unit per sentiment class.
model.add(Dense(4, activation='softmax'))
# The targets are one-hot vectors over four mutually exclusive classes and the
# output layer is a softmax, so the matching loss is categorical cross-entropy.
# binary_crossentropy would score each of the four outputs as an independent
# yes/no problem.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for 50 epochs in batches of 64. The held-out split is passed as
# validation data, so its loss/accuracy is reported after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=50,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Write the trained model to model.h5 in the working directory.
model.save('model.h5')

In [None]:
# predict() needs input data; calling it with no arguments raises TypeError.
# NOTE(review): the held-out split is assumed to be the intended input; swap
# in the real scoring set if there is one.
predictions = model.predict(X_test)
# The model emits one probability per class (four softmax units), so each row
# is reduced to its most likely class id before building a one-column frame.
prediction = pd.DataFrame({'predictions': predictions.argmax(axis=1)})
prediction.to_csv('prediction.csv')