In [1]:
# Download NLTK Dependencies
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('vader_lexicon')

!pip install wordcloud
!!pip install twython

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


['Collecting twython',
 '  Downloading https://files.pythonhosted.org/packages/24/80/579b96dfaa9b536efde883d4f0df7ea2598a6f3117a6dd572787f4a2bcfb/twython-3.8.2-py3-none-any.whl',
 'Installing collected packages: twython',
 'Successfully installed twython-3.8.2']

In [2]:
# Data wrangling libraries
import pandas as pd
import numpy as np
from collections import Counter
import re

# Data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

# Feature Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Predictive modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
import xgboost
from sklearn.model_selection import cross_val_score

# Settings
sns.set_style('whitegrid')
%matplotlib inline

  import pandas.util.testing as tm


In [3]:
train_data = pd.read_csv('/content/drive/My Drive/Projects/Vaccines sentiment/data/Train.csv')
test_data = pd.read_csv('/content/drive/My Drive/Projects/Vaccines sentiment/data/test.csv')
train_data.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [4]:
# check the number of rows and columns of the data
print (f"There are {train_data.shape[0]} rows and {train_data.shape[1]} columns in the training set.")
print (f"There are {test_data.shape[0]} rows and {test_data.shape[1]} columns in the test set.")

There are 10001 rows and 4 columns in the training set.
There are 5177 rows and 2 columns in the test set.


# Data Exploration

In [5]:
train_data.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [6]:
train_data.dropna(inplace = True)

In [7]:
train_data = train_data.reset_index().drop('index', axis = 1)

In [8]:
train_data.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

# Data Cleaning

## Preprocessing train_data

In [9]:
# here is an example tweet selected randomly
tweet = train_data.loc[0, 'safe_text']
tweet

'Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St. <url>'

In [10]:
# Function to remove punctuation
def removePunctuation(tweet):
    """
    Return the removal of punctuation and other uncommon characters in the tweet.
    
    Parameters
    ----------
        tweet (str): string containing punctuation to be removed.
        
    Returns
    -------
        clean_tweet (str): string without punctuation.
        
    Examples
    --------
    >>> removePunctuation("Hey! Check out this story: urlweb. He doesn't seem impressed. :)")
            
        "Hey Check out this story urlweb He doesn't seem impressed"
    """    
    # first remove line spaces
    # clean_tweet = tweet.replace('\n', ' ')
    clean_tweet = tweet
    
    # substitute digits within text with an empty strring
    clean_tweet = re.sub('\w*\d\w*', ' ', clean_tweet)
    
    # remove punctuation
    # some of the character removed here were determined by visually inspecting the text
    clean_tweet = re.sub(r'[<>:;.,_%()/\{}"?\!&¬¦ãÃâÂ¢\d]', '', clean_tweet) 
    
    # return cleaner tweet
    return clean_tweet

In [11]:
# example implementation
tweet = removePunctuation(tweet)
tweet

'Me amp The Big Homie   #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St url'

In [12]:
# finally implement the function across the training data
train_data['tweets'] = train_data['safe_text'].map(removePunctuation)

In [13]:
# Function to generate tweet tokenization
def tweetTokenizer(tweet):
    """
    This method tokenizes and strips handles from twitter data.
    
    Parameters
    ----------
        tweet (str): string to be tokenized.
    Returns
    -------
        tokenized_tweet (list): list of tokens in tweet.
    Examples
    --------
    >>> tweetTokenizer("Read @swrightwestoz's latest on climate change insurance amp lending 
                                       featuring APRA speech and @CentrePolicyDev work urlweb")
    
        ['read',
        'latest',
        'on',
        'climate',
        'change',
        'insurance',
        'amp',
        'lending',
        'featuring',
        'apra',
        'speech',
        'and',
        'work',
        'urlweb']
    """
    tokenizer = TweetTokenizer(preserve_case = False, strip_handles = False)
    tokenized_tweet = tokenizer.tokenize(tweet)
    return tokenized_tweet

In [14]:
# example implementation
tokenized_tweet = tweetTokenizer(tweet)
tokenized_tweet

['me',
 'amp',
 'the',
 'big',
 'homie',
 '#meanboy',
 '#mb',
 '#mbs',
 '#mmr',
 '#stegmanlife',
 '@',
 'stegman',
 'st',
 'url']

In [15]:
# finally implement the function across the training data
train_data['tweets'] = train_data['tweets'].map(tweetTokenizer)

In [19]:
# Function to generate tweet lemmatization
def lemmatizeTweet(tweet):
    """
    Return tweet lemmatizer.
    
    Parameters
    ----------
        tweet (list): tokens to be lemmatized.
        
    Returns
    -------
        lemmatized_tweet (list): lemmatized list of tokens.
        
    Examples
    --------
    >>> lemmatizeTweet(['read',
                        'latest',
                        'on',
                        'climate',
                        'change',
                        'insurance',
                        'amp',
                        'lending',
                        'featuring',
                        'apra',
                        'speech',
                        'and',
                        'work',
                        'urlweb'])
                        
        ['read',
        'latest',
        'climate',
        'change',
        'insurance',
        'lending',
        'featuring',
        'apra',
        'speech',
        'work',
        'urlweb']
    """
    lemmatized_tweet = list()
    lmtzr = WordNetLemmatizer()
    for token in tweet:
        lemmatized_tweet.append(lmtzr.lemmatize(token))
    return lemmatized_tweet

In [21]:
# finally implement the function across the training data
train_data['tweets'] = train_data['tweets'].map(lemmatizeTweet)

In [22]:
train_data['tweets']

0       [me, amp, the, big, homie, #meanboy, #mb, #mbs...
1       [i'm, thinking, of, devoting, my, career, to, ...
2       [#whatcausesautism, vaccine, do, not, vaccinat...
3       [i, mean, if, they, immunize, my, kid, with, s...
4       [thanks, to, user, catch, me, performing, at, ...
                              ...                        
9994    [living, in, a, time, where, the, sperm, i, us...
9995    [user, user, in, spite, of, all, measles, outb...
9996    [interesting, trend, in, child, immunization, ...
9997    [cdc, say, measles, are, at, highest, level, i...
9998    [pneumonia, vaccine, for, woman, w, risk, of, ...
Name: tweets, Length: 9999, dtype: object

In [23]:
train_data['tweets_clean'] = train_data['tweets'].map(lambda tweet: ' '.join(tweet))
train_data.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,tweets,tweets_clean
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,"[me, amp, the, big, homie, #meanboy, #mb, #mbs...",me amp the big homie #meanboy #mb #mbs #mmr #s...
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,"[i'm, thinking, of, devoting, my, career, to, ...",i'm thinking of devoting my career to proving ...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,"[#whatcausesautism, vaccine, do, not, vaccinat...",#whatcausesautism vaccine do not vaccinate you...
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,"[i, mean, if, they, immunize, my, kid, with, s...",i mean if they immunize my kid with something ...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,"[thanks, to, user, catch, me, performing, at, ...",thanks to user catch me performing at la nuit ...


## Preprocessing test_data

In [34]:
test_data.isnull().sum()
test_data[test_data['safe_text'].isnull()]
test_data.fillna('missing', inplace = True)
test_data[test_data['safe_text'].isnull()]

test_data['tweets'] = test_data['safe_text'].map(removePunctuation)
test_data['tweets'] = test_data['tweets'].map(tweetTokenizer)
test_data['tweets'] = test_data['tweets'].map(lemmatizeTweet)

test_data['tweets_clean'] = test_data['tweets'].map(lambda tweet: ' '.join(tweet))
test_data.head()

Unnamed: 0,tweet_id,safe_text,tweets,tweets_clean
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,"[user, user, amp, a, vaccine, given, healthy, ...",user user amp a vaccine given healthy peep fda...
1,00UNMD0E,Students starting school without whooping coug...,"[student, starting, school, without, whooping,...",student starting school without whooping cough...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...","[i'm, kinda, over, every, ep, of, user, being,...",i'm kinda over every ep of user being ripped f...
3,01HOEQJW,How many innocent children die for lack of vac...,"[how, many, innocent, child, die, for, lack, o...",how many innocent child die for lack of vaccin...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...","[cdc, eyeing, bird, flu, vaccine, for, human, ...",cdc eyeing bird flu vaccine for human though r...


# Predictive modeling

## Data preprocessing

In [29]:
X = train_data['tweets_clean']
vec = TfidfVectorizer(ngram_range=(1,2))

X_train = train_data['tweets_clean']
vec = TfidfVectorizer(ngram_range=(1,2))
X_train = vec.fit_transform(X_train)
y_train = train_data['label']

X_test = test_data['tweets_clean']
X_test = vec.transform(X_test)
X_train.shape, X_test.shape

((9999, 81440), (5177, 81440))

## Model Stacking

### Linear Model as a meta learner

In [46]:
svr = SVR(kernel = 'linear')
gbr = GradientBoostingRegressor(n_estimators = 200)
xgb = xgboost.XGBRegressor(n_estimators = 200, learning_rate = 0.08, gamma = 0, subsample = 0.75,
                           colsample_bytree = 1, max_depth = 7)

estimators = [
    ('SVR', svr),
    ('GBR', gbr),
    ('XGB', xgb)
]

final_estimator = LinearRegression()

In [47]:
stacking_regressor  = StackingRegressor(
    estimators = estimators, final_estimator = final_estimator
)

In [48]:
stacking_regressor.fit(X_train, y_train)



StackingRegressor(cv=None,
                  estimators=[('SVR',
                               SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                   epsilon=0.1, gamma='scale', kernel='linear',
                                   max_iter=-1, shrinking=True, tol=0.001,
                                   verbose=False)),
                              ('GBR',
                               GradientBoostingRegressor(alpha=0.9,
                                                         ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='ls', max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=N...
          

In [49]:
predictions = stacking_regressor.predict(X_test)
submission = pd.DataFrame({'tweet_id':test_data['tweet_id'], 'label':predictions})
submission.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,-0.158532
1,00UNMD0E,0.601661
2,01AXPTJF,0.127773
3,01HOEQJW,0.803955
4,01JUKMAO,0.147379


In [51]:
submission.to_csv('/content/drive/My Drive/Projects/Vaccines sentiment/FINAL SUBMISSION.csv', index = False)