## Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.metrics import f1_score

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in your data from kaggle.  
By working in a kaggle kernel, you can access the data directly from the competition, as well as make your submission without downloading your output file

In [2]:
train = pd.read_csv('../input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('../input/climate-change-edsa2020-21/test.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

## Splitting out the X variable from the target

In [4]:
y = train['sentiment']
X = train['message']

In [5]:
from nltk.stem import PorterStemmer

# init stemmer
porter_stemmer=PorterStemmer()

def my_cool_preprocessor(text):
    
    text=text.lower() 
    text=re.sub("\\W"," ",text) # remove special chars
    text=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",text) # normalize certain words
    
    # stem words
    words=re.split("\\s+",text)
    stemmed_words=[porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)

In [6]:
def my_tokenizer(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)

    # split based on whitespace
    return re.split("\\s+",text)

## Turning text into something your model can read

In [7]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=my_tokenizer, min_df=2,stop_words=["all","in","the","is","and"])
X_vectorized = vectorizer.fit_transform(X)
, 
#stop_words="english",max_df=0.85, preprocessor=my_cool_preprocessor,

''

In [8]:
vectorizer.vocabulary_

{'epa': 15093,
 'chief': 11392,
 'doesn': 14143,
 "'": 541,
 't': 31811,
 'think': 32868,
 'carbon': 10328,
 'dioxide': 13840,
 'main': 22129,
 'cause': 10470,
 'of': 24507,
 'global': 17312,
 'warming': 35610,
 '.': 1906,
 'wait': 35496,
 ',': 1034,
 'what': 36362,
 '!': 16,
 '?': 4285,
 'https': 19049,
 ':': 3215,
 '/': 2386,
 'co': 11937,
 'via': 35288,
 '@': 4411,
 'mashable': 22394,
 'epa chief': 15112,
 'chief doesn': 11399,
 "doesn '": 14144,
 "' t": 821,
 't think': 31939,
 'think carbon': 32879,
 'carbon dioxide': 10333,
 'dioxide main': 13849,
 'main cause': 22130,
 'cause of': 10482,
 'of global': 24646,
 'global warming': 17346,
 'warming .': 35619,
 '. .': 1917,
 '. wait': 2342,
 'wait ,': 35497,
 ', what': 1593,
 'what !': 36363,
 '! ?': 28,
 '? https': 4335,
 'https :': 19050,
 ': /': 3226,
 '/ /': 2390,
 '/ t': 2581,
 't .': 31815,
 '. co': 2000,
 'co /': 11941,
 'via @': 35291,
 '@ mashable': 5199,
 'it': 20140,
 's': 28875,
 'not': 24049,
 'like': 21620,
 'we': 36076,

## Splitting the training data into a training and validation set

In [9]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.20,shuffle=True, stratify=y, random_state=25)

## Training the model and evaluating using the validation set 

In [10]:
#rfc = RandomForestClassifier()
#rfc.fit(X_train, y_train)
#rfc_pred = rfc.predict(X_val)

In [11]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_val)

## Checking the performance of our model on the validation set

In [12]:
f1_score(y_val, lsvc_pred, average="macro")

0.6749640876116144

In [13]:
from sklearn import metrics

print(metrics.classification_report(y_val, lsvc_pred))

              precision    recall  f1-score   support

          -1       0.75      0.46      0.57       259
           0       0.62      0.42      0.50       471
           1       0.78      0.88      0.83      1706
           2       0.78      0.83      0.81       728

    accuracy                           0.77      3164
   macro avg       0.73      0.65      0.67      3164
weighted avg       0.76      0.77      0.75      3164



## Getting our test set ready 

In [14]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [15]:
y_pred = lsvc.predict(test_vect)

In [16]:
test['sentiment'] = y_pred

In [17]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,2
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


## Creating an output csv for submission

In [18]:
test[['tweetid','sentiment']].to_csv('testsubmission_8.csv', index=False)