In [1]:
# Libraries for data loading, data manipulation and data visulisation
import nltk
import re
import csv
import string
from PIL import Image
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# Downloads
#nlp = spacy.load('en')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Libraries for data preparation and model building
# Preprocessing
from collections import Counter
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet  
from sklearn.feature_extraction.text import CountVectorizer   
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, accuracy_score


# Setting global constants to ensure notebook results are reproducible
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
sns.set(rc={'figure.figsize':(12,8)})

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test_with_no_labels.csv')

FileNotFoundError: ignored

In [None]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
df['message'] = df['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)


###test
test['message'] = test['message'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [None]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

df['message'] = df['message'].apply(remove_punctuation)



###test
test['message'] = test['message'].apply(remove_punctuation)

In [None]:
# Remove all words below 3 characters
df['newmessage'] = df['message'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test['newmessage'] =test['message'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [None]:
df.head()

Unnamed: 0,sentiment,message,tweetid,newmessage
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221,PolySciMajor chief doesnt think carbon dioxide...
1,1,Its not like we lack evidence of anthropogenic...,126103,like lack evidence anthropogenic global warming
2,2,RT RawStory Researchers say we have three year...,698562,RawStory Researchers have three years climate ...
3,1,TodayinMaker WIRED 2016 was a pivotal year in...,573736,TodayinMaker WIRED 2016 pivotal year climate c...
4,1,RT SoyNovioDeTodas Its 2016 and a racist sexis...,466954,SoyNovioDeTodas 2016 racist sexist climate cha...


In [None]:
df['newmessage'] = df['newmessage'].str.lower()
test['newmessage'] = test['newmessage'].str.lower()

In [None]:
tokeniser = TreebankWordTokenizer()
df['tokenised_message'] = df['newmessage'].apply(tokeniser.tokenize)
df['tokenised_message'].iloc[2]

['rawstory',
 'researchers',
 'have',
 'three',
 'years',
 'climate',
 'change',
 'before',
 'it’s',
 'late',
 'urlweb',
 'urlweb…']

In [None]:
stemmer = SnowballStemmer('english')

def mbti_stemmer(words, stemmer):
    return [stemmer.stem(word) for word in words]

df['stemmed_message'] = df['tokenised_message'].apply(mbti_stemmer, args=(stemmer, ))

for i, t in enumerate(df.iloc[15]['tokenised_message']):    
    print ('{:20s} --> {:10s}'.format(t, df.iloc[15]['stemmed_message'][i]))

glblctzn             --> glblctzn  
dont                 --> dont      
wan                  --> wan       
na                   --> na        
live                 --> live      
forever              --> forev     
nothing              --> noth      
will                 --> will      
because              --> becaus    
climate              --> climat    
change               --> chang     
����️��              --> ����️��   
taylorswift13        --> taylorswift13
zaynmalik            --> zaynmalik 
urlweb               --> urlweb    


In [None]:
lemmatizer = WordNetLemmatizer()

def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]  

df['lemma_message'] = df['tokenised_message'].apply(mbti_lemma, args=(lemmatizer, ))  

for i, t in enumerate(df.iloc[1]['tokenised_message']):    
    print ('{:20s} --> {:10s}'.format(t, df.iloc[1]['lemma_message'][i]))

like                 --> like      
lack                 --> lack      
evidence             --> evidence  
anthropogenic        --> anthropogenic
global               --> global    
warming              --> warming   


In [None]:
TFID = TfidfVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.90, 
                             ngram_range=(1, 3))

In [None]:
X = df['newmessage']
y = df['sentiment']

###test
X_real = test['newmessage']

In [None]:
X_vec_t = TFID.fit_transform(X)

###real
X_vec_t_real = TFID.transform(X_real)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_vec_t, y, test_size=0.20, random_state=32, stratify=y)

In [None]:
### Linear SVC
lsvc = LinearSVC()
# Fit model to training data
lsvc.fit(X_train, y_train)
# Use trained model to run prediction on validation data
lsvc_pred= lsvc.predict(X_test)
print("Linear SVC Metrics")
print(metrics.classification_report(y_test, lsvc_pred))

Linear SVC Metrics
              precision    recall  f1-score   support

          -1       0.76      0.51      0.61       259
           0       0.56      0.41      0.47       471
           1       0.76      0.86      0.81      1706
           2       0.75      0.75      0.75       728

    accuracy                           0.74      3164
   macro avg       0.71      0.63      0.66      3164
weighted avg       0.73      0.74      0.73      3164



In [None]:
lsvc_pred_real= lsvc.predict(X_vec_t_real)

In [None]:
test['sentiment'] = lsvc_pred_real

In [None]:
test.head(25)

Unnamed: 0,message,tweetid,newmessage,sentiment
0,Europe will now be looking to China to make su...,169760,europe will looking china make sure that alone...,1
1,Combine this with the polling of staffers re c...,35326,combine this with polling staffers climate cha...,1
2,The scary unimpeachable evidence that climate ...,224985,scary unimpeachable evidence that climate chan...,1
3,Karoli morgfair OsborneInk dailykos \nPutin go...,476263,karoli morgfair osborneink dailykos putin jill...,1
4,RT FakeWillMoore Female orgasms cause global w...,872928,fakewillmoore female orgasms cause global warm...,0
5,RT nycjim Trump muzzles employees of several g...,75639,nycjim trump muzzles employees several gov’t a...,1
6,bmastenbrook yes wrote that in 3rd yr Comp Sci...,211536,bmastenbrook wrote that comp ethics part told ...,1
7,RT climatehawk1 Indonesian farmers weather cli...,569434,climatehawk1 indonesian farmers weather climat...,1
8,RT guardian British scientists face a ‘huge hi...,315368,guardian british scientists face ‘huge hit’ cu...,2
9,Aid For Agriculture Sustainable agriculture a...,591733,agriculture sustainable agriculture climate ch...,1


In [None]:
test[['tweetid','sentiment']].to_csv('Team16.csv', index=False)