# Loading Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import emoji

from IPython.display import Markdown as md
# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook.
plt.style.use('ggplot')

In [None]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [None]:
!nvidia-smi #Check that a GPU is attached

# Loading Data

In [None]:
# Locations of the competition CSV files, relative to the notebook.
train_path, test_path, sample_submission_path = (
    "../input/tweet-sentiment-extraction/train.csv",
    "../input/tweet-sentiment-extraction/test.csv",
    "../input/tweet-sentiment-extraction/sample_submission.csv",
)

In [None]:
# Load the training set, the test set and the sample submission into
# DataFrames.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
submission = pd.read_csv(sample_submission_path)

In [None]:
# Explore Data

In [None]:
# Preview the first rows of the raw training data.
df_train.head()

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
df_train.info()

In [None]:
# Summarise the test frame's columns (dtypes and non-null counts).
df_test.info()

In [None]:
# Report the (rows, columns) shape of each split.
for label, frame in (('Training data shape: ', df_train),
                     ('Testing data shape: ', df_test)):
    print(label, frame.shape)

### Missing Values treatment in the dataset

In [None]:
# Count the missing values in each column of the training set.
df_train.isnull().sum()

In [None]:
# Count the missing values in each column of the test set.
df_test.isnull().sum()

In [None]:
# Remove, in place, every training row that has at least one missing
# value, then recount the nulls to confirm none are left.
df_train.dropna(how='any', axis=0, inplace=True)
df_train.isna().sum()

# Calculating and analyzing the character length of each text

In [None]:
# Number of characters in each tweet.
df_train['Char_length'] = df_train['text'].map(len)

In [None]:
# Preview the frame with the new Char_length column.
df_train.head()

In [None]:
# Histogram of tweet lengths.  `sns.distplot` is deprecated in seaborn;
# `sns.histplot` is its replacement for a plain histogram, and kde=False
# keeps the density curve off as before.
sns.histplot(df_train["Char_length"], kde=False)

# Step 1: Contraction Mapping / Expanding Contractions

In [None]:
!pip install contractions
import contractions

In [None]:
def _expand_contractions(text):
    """Split *text* on whitespace and run ``contractions.fix`` on each piece."""
    return [contractions.fix(piece) for piece in text.split()]


# One list of processed whitespace-separated pieces per tweet.
df_train['no_contract'] = df_train['text'].apply(_expand_contractions)

In [None]:
# Preview the frame with the new no_contract column.
df_train.head()

In [None]:
# Glue each list in no_contract back into one space-separated string.
df_train["msg_str"] = [
    ' '.join(str(piece) for piece in pieces)
    for pieces in df_train['no_contract']
]

In [None]:
# Preview the frame with the new msg_str column.
df_train.head()

### Tokenization

In [None]:
import nltk
# Download the 'punkt' NLTK resource; presumably required by word_tokenize,
# which is used in the next cell.
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
# Tokenize each msg_str value with NLTK's word_tokenize and store the
# result in the 'tokenized' column.
df_train['tokenized'] = df_train['msg_str'].apply(word_tokenize)
df_train.head()

### Noise cleaning

In [None]:
# Lower-case every token of every tweet.
df_train['lower'] = df_train['tokenized'].apply(
    lambda tokens: [token.lower() for token in tokens]
)
df_train.head()

In [None]:
import string

punc = string.punctuation
# Set of individual punctuation characters for per-character membership tests.
_punc_chars = frozenset(punc)


def _is_punctuation(token):
    """Return True when every character of *token* is ASCII punctuation."""
    return all(char in _punc_chars for char in token)


# Drop tokens made up solely of punctuation.  The check is per character:
# `token not in punc` would be a substring test against the punctuation
# string, which lets multi-character tokens such as '...' or '!!' through.
df_train['no_punc'] = df_train['lower'].apply(
    lambda tokens: [token for token in tokens if not _is_punctuation(token)]
)
df_train.head()

In [None]:
# Download NLTK's stop-word lists and keep the English one as a set, so the
# membership tests done per token are cheap.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Stop words

In [None]:
def _drop_stop_words(tokens):
    """Return the tokens of *tokens* that are not in ``stop_words``."""
    return [token for token in tokens if token not in stop_words]


df_train['stopwords_removed'] = df_train['no_punc'].apply(_drop_stop_words)
df_train.head()

# Stemming/Lemmatization

In [None]:
# Download the 'averaged_perceptron_tagger' NLTK resource; presumably needed
# by the pos_tag call in the next cell.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Tag each token list with nltk.tag.pos_tag; a later cell unpacks every
# result element as a (word, tag) pair.
df_train['pos_tags'] = df_train['stopwords_removed'].apply(nltk.tag.pos_tag)
df_train.head()

In [None]:
# Download the WordNet corpus; its POS constants (wordnet.ADJ, ...) are used
# by get_wordnet_pos.
nltk.download('wordnet')
from nltk.corpus import wordnet

In [None]:
def get_wordnet_pos(tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.
    Any other tag falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

In [None]:
def _to_wordnet_pairs(tagged_tokens):
    """Swap the tag of each (word, tag) pair for its WordNet POS constant."""
    return [(word, get_wordnet_pos(pos_tag)) for word, pos_tag in tagged_tokens]


df_train['wordnet_pos'] = df_train['pos_tags'].apply(_to_wordnet_pairs)
df_train.head()

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
wnl = WordNetLemmatizer()


def _lemmatize_pairs(pairs):
    """Lemmatize each (word, WordNet POS) pair with the shared lemmatizer."""
    return [wnl.lemmatize(word, pos) for word, pos in pairs]


df_train['lemmatized'] = df_train['wordnet_pos'].apply(_lemmatize_pairs)
df_train.head()

In [None]:
# Save the training frame, including all intermediate columns built so far,
# to a CSV file in the working directory.
df_train.to_csv('twitter_sentiment_preprocessing_v1.csv')

# Preprocessing

In [None]:
# NOTE(review): no cell visible in this notebook imports this package -
# confirm the install is still needed.
pip install text-preprocessing

In [None]:
# text preprocessing helper functions

# Patterns deleted by clean_text(), in the order they are applied.  Raw
# strings avoid the invalid-escape-sequence warnings that '\[' or '\S'
# trigger in ordinary string literals.
_CLEAN_TEXT_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\[.*?\]',                               # text in square brackets
        r'https?://\S+|www\.\S+',                 # links
        r'<.*?>+',                                # text in angle brackets
        '[%s]' % re.escape(string.punctuation),   # ASCII punctuation
        r'\n',                                    # newlines
        r'\w*\d\w*',                              # words containing a digit
    )
)


def clean_text(text):
    """Lower-case *text* and strip noise from it.

    Removed, in this order: text in square brackets, links, text in angle
    brackets, ASCII punctuation, newlines and words containing a digit.
    Every match is deleted rather than replaced by a space, so surrounding
    whitespace is left as is and words separated only by a newline are
    joined together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    text = text.lower()
    for pattern in _CLEAN_TEXT_PATTERNS:
        text = pattern.sub('', text)
    return text


def text_preprocessing(text):
    """Clean *text* and return its words joined by single spaces.

    The text goes through ``clean_text``, is split into runs of word
    characters with an NLTK ``RegexpTokenizer``, and the pieces are joined
    back together with one space between them.  Stop words are kept.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    words = word_tokenizer.tokenize(clean_text(text))
    return ' '.join(words)

In [None]:
# Add a cleaned copy of the tweet text to both the training and test frames.
for frame in (df_train, df_test):
    frame['text_clean'] = frame['text'].apply(str).apply(text_preprocessing)
df_train.head()

In [None]:
# Save the frame again, now with the text_clean column.  The file name is
# the same one used by the earlier save, so that file is overwritten.
df_train.to_csv('twitter_sentiment_preprocessing_v1.csv')

# Padding

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
# NOTE: the next cell assigns a new Tokenizer() to this name before fitting,
# so this instance is replaced without being used.
tokenizer = Tokenizer()

In [None]:
# Fit a fresh tokenizer on the cleaned training text and convert every tweet
# into a sequence with texts_to_sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['text_clean'])
encoded = tokenizer.texts_to_sequences(df_train['text_clean'])
# Prints every encoded tweet, so the output can be very long.
print(encoded)

In [None]:
# Length of the longest encoded tweet.
max_len = max(map(len, encoded))
print(max_len)

In [None]:
# Manual padding: append zeros, in place, to every sequence that is shorter
# than max_len.
for sequence in encoded:
    missing = max_len - len(sequence)
    if missing > 0:
        sequence.extend([0] * missing)

padded_np = np.array(encoded)
padded_np

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Encode the text again: the manual padding loop appended zeros to the lists
# in `encoded` in place, so unpadded sequences have to be rebuilt.
encoded = tokenizer.texts_to_sequences(df_train['text_clean'])
print(encoded)

In [None]:
# Pad the sequences with Keras' pad_sequences, using its default arguments.
# NOTE(review): Keras documents front ('pre') padding as the default, whereas
# the manual loop earlier appended zeros at the end - confirm which side is
# intended.
padded = pad_sequences(encoded)
padded

# One-hot encoding

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the padded index matrix.
# NOTE(review): this presumably adds an axis about as large as the
# vocabulary for every position of every tweet, which may not fit in memory
# on the full training set - confirm this is intended (one-hot encoding the
# labels may have been meant instead).
one_hot = to_categorical(padded)
print(one_hot)