In [1]:
# Import relevant libraries
import pandas as pd
import numpy as np
import nltk
import string
import re
# Download the NLTK resources this notebook relies on:
#   stopwords -> stopwords.words('english')
#   punkt     -> tokenizer models used by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer models used by word_tokenize on newer NLTK releases,
#                which no longer load the pickled 'punkt' models
#   wordnet   -> WordNetLemmatizer
# Both punkt variants are requested so the notebook runs on either generation
# of NLTK.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Tensorflow (Model)
import tensorflow as tf

## Importing Dataset

**The dataset consists of two sets:**
* Training Set
* Test or Validation Set

In [2]:
# The CSV is read with header=None, so column names are supplied explicitly.
colnames = ['TweetId', 'Entity', 'Output', 'Tweet']

data = pd.read_csv(
    '../input/twitter-entity-sentiment-analysis/twitter_validation.csv',
    header=None,
    names=colnames,
)

In [3]:
# `main` collects one cleaned string per row of data['Tweet'], in row order.
main = []

# Regex matching any single punctuation character (!;,"% etc.)
re_puncs = re.compile('[%s]' % re.escape(string.punctuation))
# English stop words (a, an, the, when, there, this, ...) as a set for O(1) lookup
stop_word  = set(stopwords.words('english'))
# WordNet lemmatizer
lem = WordNetLemmatizer()
# Porter stemmer
p_stem = PorterStemmer()

# Iterate over the tweet values themselves (positional order) instead of
# looking rows up by integer label, so the loop does not depend on the
# dataframe having a default 0..n-1 index.
for raw_tweet in tqdm(data['Tweet']):
    # A missing tweet is NaN; str(NaN) would produce the literal word "nan",
    # which passes every filter below and pollutes the vocabulary. Treat
    # missing tweets as empty text instead.
    text = '' if pd.isna(raw_tweet) else str(raw_tweet)
    # Tokenization
    tokens = word_tokenize(text)
    # Lower-case every token
    tokens = [w.lower() for w in tokens]
    # Strip punctuation characters from each token
    tokens = [re_puncs.sub('', w) for w in tokens]
    # Keep purely alphabetic tokens (also drops tokens emptied by the step above)
    tokens = [w for w in tokens if w.isalpha()]
    # Remove stop words
    tokens = [w for w in tokens if w not in stop_word]
    # Lemmatize
    tokens = [lem.lemmatize(w) for w in tokens]
    # Stem
    tokens = [p_stem.stem(w) for w in tokens]
    # Re-join the tokens into a single space-separated string and store it
    main.append(' '.join(tokens))

In [4]:
# Peek at the first five rows of the loaded dataframe
data.head(n=5)

In [5]:
# Attach the cleaned text as 'Preprocess_Tweet' and discard the unstructured
# 'Tweet' column, in one chained expression.
data = data.assign(Preprocess_Tweet=main).drop(columns='Tweet')

In [6]:
# Peek at the first five rows of the dataframe
data.head(n=5)

In [7]:
# Bag-of-words features: fit a word-level CountVectorizer on the cleaned
# tweets, then expand the resulting sparse count matrix into a dense array.
cnt = CountVectorizer(analyzer="word")
counts_sparse = cnt.fit_transform(main)
X = counts_sparse.toarray()

In [8]:
# Distinct label values present in the 'Output' column
data['Output'].unique()

In [9]:
# Count how many rows have a missing value in 'Output'
data['Output'].isna().sum()

In [10]:
# One-hot encode the labels: each distinct value in 'Output' becomes its own
# 0/1 column of y. The encoder is given 2-D input, hence the (-1, 1) reshape
# into a single-column array.
ip = np.array(data['Output']).reshape(-1, 1)
enc = OneHotEncoder(handle_unknown='ignore')
y = enc.fit_transform(ip).toarray()

In [11]:
# Show X, the feature array (independent values)
X

In [12]:
# Show y, the encoded labels (dependent values)
y

In [13]:
# Split into training (80%) and test (20%) sets. random_state is fixed so the
# shuffle, and therefore every downstream result, is identical on each
# Restart & Run All instead of changing from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Show the training-set features (independent values) returned by train_test_split
X_train

In [15]:
# Show the training-set labels (dependent values) returned by train_test_split
y_train