# Project Notebook

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import nltk

# Download the NLTK data packages used further down in this notebook.
# Tokenizer models; presumably what word_tokenize relies on.
nltk.download('punkt')
# WordNet corpus; presumably what WordNetLemmatizer relies on.
nltk.download('wordnet')
# Stop-word lists, read later through nltk.corpus.stopwords.
nltk.download('stopwords')
# POS tagger model; presumably what pos_tag relies on.
nltk.download('averaged_perceptron_tagger')

## Data exploration

### Opening files and scouting

#### Sentiment140 file

In [None]:
#df_raw_sentiment = pd.read_csv('data/sentiment140.csv', 
#                           engine='python', 
#                           header=None, 
#                           names=['classification', 'tweet_id', 'date', '?', 'account', 'text'])
#df_raw_sentiment.head()

In [None]:
#df_sentiment = df_raw_sentiment[['classification', 'text']]
#df_sentiment.head()

#### Twitter Airline file

In [None]:
#df_raw_airlines = pd.read_csv('data/twitter-airline-sentiment.csv', engine='python')
#df_raw_airlines.head()

In [None]:
#df_airlines = df_raw_airlines[['airline_sentiment', 'text']]
#classification = np.zeros(len(df_airlines), dtype=int)
#classification[df_airlines['airline_sentiment']=='negative'] = -1
#classification[df_airlines['airline_sentiment']=='neutral'] = 0
#classification[df_airlines['airline_sentiment']=='positive'] = 1
#df_airlines.insert(loc=2, column='classification', value=classification)
#df_airlines = df_airlines[['classification', 'text']]
#df_airlines.head()

### Cleaning data 

In [None]:
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string

def lemmatize_sentence(tokens):
    """Return the WordNet lemma of every token, in the original order.

    The tokens are POS-tagged with ``pos_tag``; the tag picks the part of
    speech handed to the lemmatizer: tags starting with ``NN`` map to
    ``'n'``, tags starting with ``VB`` map to ``'v'`` and every other tag
    maps to ``'a'``.

    Args:
        tokens: sequence of word tokens accepted by ``pos_tag``.

    Returns:
        A list with one lemmatized string per input token.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(treebank_tag):
        # Collapse the tagger's tag onto the three classes used here.
        if treebank_tag.startswith('NN'):
            return 'n'
        if treebank_tag.startswith('VB'):
            return 'v'
        return 'a'

    return [
        wordnet.lemmatize(word, wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

def clean_single(tweet, stop_words = stopwords.words('english'), numbers=True):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. strip URLs, ``@mentions`` and ``$``-prefixed words with regexes;
      2. tokenize with ``word_tokenize``;
      3. lemmatize through ``lemmatize_sentence`` (POS-aware);
      4. lower-case the tokens and drop punctuation and stop words;
      5. optionally drop every token containing an ASCII digit.

    Args:
        tweet: raw tweet text (``str``).
        stop_words: iterable of lower-case words to discard. Defaults to
            NLTK's English stop-word list, evaluated once at definition time.
        numbers: when true (default), tokens containing any of the
            characters ``0``-``9`` are removed.

    Returns:
        The surviving tokens joined by single spaces (possibly ``''``).
    """
    # Removing links (raw strings: `\(` / `\)` are regex escapes, not string
    # escapes, so the compiled pattern is unchanged but warning-free).
    tweet = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', tweet)
    # Removing @
    tweet = re.sub(r"(@[A-Za-z0-9_]+)", "", tweet)
    # Removing currencies
    tweet = re.sub(r'\$\w*', '', tweet)

    # Hash-based lookup instead of scanning the stop-word list per token.
    stop_set = frozenset(stop_words)

    cleaned_tokens = []
    # lemmatize_sentence applies the same NN/VB/other -> n/v/a mapping this
    # function used to duplicate, and builds the lemmatizer only once.
    for token in lemmatize_sentence(word_tokenize(tweet)):
        lowered = token.lower()
        # `token not in string.punctuation` is a substring test: a token is
        # dropped when it occurs verbatim inside the punctuation string.
        if len(token) > 0 and token not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)

    if numbers:
        # remove tokens containing numbers (ASCII digits only)
        cleaned_tokens = [w for w in cleaned_tokens if not re.search(r'[0-9]', w)]

    # Join outside the `numbers` branch so a result is also defined (and
    # returned) when numbers=False.
    return ' '.join(cleaned_tokens)

def clean(array):
    """Clean every entry of ``array`` in place with ``clean_single``.

    Each element is overwritten by its cleaned version, so the caller's
    container is modified; the same object is returned for convenience.

    Args:
        array: indexable, mutable collection of raw tweet strings.

    Returns:
        ``array`` itself, now holding the cleaned strings.
    """
    for position, raw_text in enumerate(array):
        cleaned_text = clean_single(raw_text)
        array[position] = cleaned_text
    return array

In [None]:
#airlines_classification = df_airlines.classification.values
#airlines_text = clean(df_airlines.text.values)

#sentiment_classification = df_sentiment.classification.values
#sentiment_text = clean(df_sentiment.text.values)

#print(airlines_classification.shape, airlines_text.shape)
#print(sentiment_classification.shape, sentiment_text.shape)

In [None]:
#temp = pd.DataFrame({'text':airlines_text,'classification':airlines_classification})
#temp.to_csv('data/airlines_cleaned.csv', index=False)
#temp = pd.DataFrame({'text':sentiment_text,'classification':sentiment_classification})
#temp.to_csv('data/sentiment_cleaned.csv', index=False)

#### Opening cleaned data

In [None]:
# Load the pre-cleaned airline tweets; the commented-out export cell above
# writes this file with the columns 'text' and 'classification'.
# NOTE(review): tweets that cleaned down to an empty string presumably come
# back as NaN from read_csv -- confirm and handle before modelling.
df_airlines = pd.read_csv('data/airlines_cleaned.csv')
# Preview the first rows as a sanity check.
df_airlines.head()

In [None]:
# Load the pre-cleaned Sentiment140 tweets; the commented-out export cell
# above writes this file with the columns 'text' and 'classification'.
# NOTE(review): tweets that cleaned down to an empty string presumably come
# back as NaN from read_csv -- confirm and handle before modelling.
df_sentiment = pd.read_csv('data/sentiment_cleaned.csv')
# Preview the first rows as a sanity check.
df_sentiment.head()

### Visualisation

## Classification

### Naive Bayes

### SVM

### Fully connected network

### LSTM network