In [1]:
import nltk
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample
import string
from nltk.tokenize import TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from nltk.stem import WordNetLemmatizer


#import emoji
#from emoji import emojize

# set plot style
sns.set()

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test_with_no_labels.csv')

In [3]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)


###test
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [4]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [5]:
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

df['message'] = df['message'].apply(remove_punctuation)



###test
test['message'] = test['message'].apply(remove_punctuation)

In [6]:
# Remove all words below 3 characters
df['newmessage'] = df['message'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test['newmessage'] =test['message'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [7]:
df.head()

Unnamed: 0,sentiment,message,tweetid,newmessage
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221,PolySciMajor chief doesnt think carbon dioxide...
1,1,Its not like we lack evidence of anthropogenic...,126103,like lack evidence anthropogenic global warming
2,2,RT RawStory Researchers say we have three year...,698562,RawStory Researchers have three years climate ...
3,1,TodayinMaker WIRED 2016 was a pivotal year in...,573736,TodayinMaker WIRED 2016 pivotal year climate c...
4,1,RT SoyNovioDeTodas Its 2016 and a racist sexis...,466954,SoyNovioDeTodas 2016 racist sexist climate cha...


In [8]:
df['newmessage'] = df['newmessage'].str.lower()
test['newmessage'] = test['newmessage'].str.lower()

In [9]:
tokeniser = TreebankWordTokenizer()
df['tokenised_message'] = df['newmessage'].apply(tokeniser.tokenize)
df['tokenised_message'].iloc[2]

['rawstory',
 'researchers',
 'have',
 'three',
 'years',
 'climate',
 'change',
 'before',
 'it’s',
 'late',
 'urlweb',
 'urlweb…']

In [10]:
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    return [stemmer.stem(word) for word in words]

df['stemmed_message'] = df['tokenised_message'].apply(mbti_stemmer, args=(stemmer, ))

for i, t in enumerate(df.iloc[15]['tokenised_message']):    
    print ('{:20s} --> {:10s}'.format(t, df.iloc[15]['stemmed_message'][i]))

glblctzn             --> glblctzn  
dont                 --> dont      
wan                  --> wan       
na                   --> na        
live                 --> live      
forever              --> forev     
nothing              --> noth      
will                 --> will      
because              --> becaus    
climate              --> climat    
change               --> chang     
����️��              --> ����️��   
taylorswift13        --> taylorswift13
zaynmalik            --> zaynmalik 
urlweb               --> urlweb    


In [11]:
lemmatizer = WordNetLemmatizer()

def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]  

df['lemma_message'] = df['tokenised_message'].apply(mbti_lemma, args=(lemmatizer, ))  

for i, t in enumerate(df.iloc[1]['tokenised_message']):    
    print ('{:20s} --> {:10s}'.format(t, df.iloc[1]['lemma_message'][i]))

like                 --> like      
lack                 --> lack      
evidence             --> evidence  
anthropogenic        --> anthropogenic
global               --> global    
warming              --> warming   


In [12]:
TFID = TfidfVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.90, 
                             ngram_range=(1, 3))

In [13]:
X = df['newmessage']
y = df['sentiment']

###test
X_real = test['newmessage']

In [14]:
X_vec_t = TFID.fit_transform(X)

###real
X_vec_t_real = TFID.transform(X_real)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_vec_t, y, test_size=0.20, random_state=32, stratify=y)

In [16]:
### Linear SVC
lsvc = LinearSVC()
# Fit model to training data
lsvc.fit(X_train, y_train)
# Use trained model to run prediction on validation data
lsvc_pred= lsvc.predict(X_test)
print("Linear SVC Metrics")
print(metrics.classification_report(y_test, lsvc_pred))

Linear SVC Metrics
              precision    recall  f1-score   support

          -1       0.76      0.51      0.61       259
           0       0.56      0.41      0.47       471
           1       0.76      0.86      0.81      1706
           2       0.75      0.75      0.75       728

    accuracy                           0.74      3164
   macro avg       0.71      0.63      0.66      3164
weighted avg       0.73      0.74      0.73      3164



In [17]:
lsvc_pred_real= lsvc.predict(X_vec_t_real)

In [18]:
test['sentiment'] = lsvc_pred_real

In [19]:
test.head(25)

Unnamed: 0,message,tweetid,newmessage,sentiment
0,Europe will now be looking to China to make su...,169760,europe will looking china make sure that alone...,1
1,Combine this with the polling of staffers re c...,35326,combine this with polling staffers climate cha...,1
2,The scary unimpeachable evidence that climate ...,224985,scary unimpeachable evidence that climate chan...,1
3,Karoli morgfair OsborneInk dailykos \nPutin go...,476263,karoli morgfair osborneink dailykos putin jill...,1
4,RT FakeWillMoore Female orgasms cause global w...,872928,fakewillmoore female orgasms cause global warm...,0
5,RT nycjim Trump muzzles employees of several g...,75639,nycjim trump muzzles employees several gov’t a...,1
6,bmastenbrook yes wrote that in 3rd yr Comp Sci...,211536,bmastenbrook wrote that comp ethics part told ...,1
7,RT climatehawk1 Indonesian farmers weather cli...,569434,climatehawk1 indonesian farmers weather climat...,1
8,RT guardian British scientists face a ‘huge hi...,315368,guardian british scientists face ‘huge hit’ cu...,2
9,Aid For Agriculture Sustainable agriculture a...,591733,agriculture sustainable agriculture climate ch...,1


In [20]:
test[['tweetid','sentiment']].to_csv('Team16.csv', index=False)