In [5]:
%matplotlib notebook
#Module to handle regular expressions
import re
#Library for emoji
import emoji
#Import pandas and numpy to handle data
import pandas as pd
import numpy as np

#import libraries for accessing the database
import psycopg2
from sqlalchemy import create_engine
from postgres_credentials import *

#import libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Import nltk to check english lexicon
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords

#import libraries for tokenization and ML
import json;
import keras;
import keras.preprocessing.text as kpt;
from keras.preprocessing.text import Tokenizer;


#Import all libraries for creating a deep neural network
#Sequential is the standard type of neural network with stackable layers
from keras.models import Sequential;
#Dense: Standard layers with every node connected, dropout: avoids overfitting
from keras.layers import Dense, Dropout, Activation;

In [2]:
#Querying the database
def query_database(tabletweets):
    engine = create_engine("postgresql+psycopg2://%s:%s@%s:%d/%s" %(usertwitter, passwordtwitter, hosttwitter, porttwitter, dbnametwitter))
    table = pd.read_sql_query('select * from %s' %tabletweets,con=engine, index_col='id')
    return table

In [3]:
#preprocess text in tweets by removing links, @UserNames, blank spaces, etc.
def preprocessing_text(table):
    #put everythin in lowercase
    table['tweet'] = table['tweet'].str.lower()
    #Replace rt indicating that was a retweet
    table['tweet'] = table['tweet'].str.replace('rt', '')
    #Replace occurences of mentioning @UserNames
    table['tweet'] = table['tweet'].replace(r'@\w+', '', regex=True)
    #Replace links contained in the tweet
    table['tweet'] = table['tweet'].replace(r'http\S+', '', regex=True)
    table['tweet'] = table['tweet'].replace(r'www.[^ ]+', '', regex=True)
    #remove numbers
    table['tweet'] = table['tweet'].replace(r'[0-9]+', '', regex=True)
    #replace special characters and puntuation marks
    table['tweet'] = table['tweet'].replace(r'[!"#$%&()*+,-./:;<=>?@[\]^_`{|}~]', '', regex=True)
    return table    

In [4]:
def in_dict(word):
    if wordnet.synsets(word):
        #if the word is in the dictionary, we'll return True
        return True

def replace_elongated_word(word):
    regex = r'(\w*)(\w+)\2(\w*)'
    repl = r'\1\2\3'    
    if in_dict(word):
        return word
    new_word = re.sub(regex, repl, word)
    if new_word != word:
        return replace_elongated_word(new_word)
    else:
        return new_word

def detect_elongated_words(row):
    regexrep = r'(\w*)(\w+)(\2)(\w*)'
    words = [''.join(i) for i in re.findall(regexrep, row)]
    for word in words:
        if not in_dict(word):
            row = re.sub(word, replace_elongated_word(word), row)
    return row 

In [6]:
def filling_words(table):
    filler_words = stopwords.words('english')
    table['tweet'] = df['tweet'].str.lower()
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (filler_words)]))

In [14]:
def cleaning_table(table):
    tweet_table = preprocessing_text(tweet_table)
    tweet_table['tweet'] = tweet_table['tweet'].apply(lambda x: detect_elongated_words(x))
    tweet_table = filling_words(table)
    return table

In [23]:
#Processing the data: Tokenization
def tokenization_tweets(table):
    tokenization = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ')
    tokenization.fit_on_texts(table['tweet'])
    return tokenization

In [None]:
#Visualizing the data

In [33]:
#Step2: Create a Neural Network

In [5]:
#Create the model
def train():
    model_nn = Sequential()
    model.add(Dense(512, input_shape=(max_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.compile(loss='categorical_crossentropy',
                 optimizer='adam',
                 metrics=['acuracy'])
    model.fit(X_train, Y_train,
             batch_size=32,
             epochs=5,
             verbose=1,
             validation_split=0.1,
             shuffle=True)
    pred=model.predict(input_data)
    labels[np.argmax(pred)], pred[0][np.argmax(pred)]

In [12]:
if __name__ == "__main__":
    tabletweets = 'tweets_avengers'
    tweet_table = query_database(tabletweets)
    tweet_table = cleaning(tweet_table)

In [13]:
tweet_table.head()

Unnamed: 0_level_0,created_at,tweet,user_id,retweetcount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2018-12-19 01:15:31,video lee sugeun youtube channel subscriber...,271496866,0
2,2018-12-19 01:15:35,avengers trailer as told by spongebob and f...,2885234679,0
3,2018-12-19 01:15:42,ok so i just watched avengers infinity war for...,14365631,0
4,2018-12-19 01:15:42,always good to know has your back,797228831031508992,0
5,2018-12-19 01:15:44,making new oreo products used to be so easy ...,1037117267396501504,0
