In [33]:
import pandas as pd
import re
import string

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [35]:
tweet_df_location = pd.read_csv('../data/graphAnalysis/clean_climateTwitterData.csv')

In [36]:
tweet_df_location.columns

Index(['Unnamed: 0', 'id', 'author_id', 'text', 'retweets', 'permalink',
       'date', 'formatted_date', 'favorites', 'mentions', 'hashtags', 'geo',
       'urls', 'search_hashtags', 'location', 'text_clean',
       'tb_sentiment_polarity', 'tb_sentiment_subjectivity',
       'textBlob_sentiment', 'vader_compound', 'vader_pos', 'vader_neg',
       'vader_neu', 'V_Sentiment'],
      dtype='object')

In [37]:
#Helper function for Tokenization

def tokenize_only(in_string):
    """
    Convert `in_string` of text to a list of tokens using NLTK's TweetTokenizer
    """
    # reasonable, but adjustable tokenizer settings
    tokenizer = TweetTokenizer(preserve_case=False,
                               reduce_len=True,
                               strip_handles=False)
    tokens = tokenizer.tokenize(in_string)
    return tokens

In [38]:


# List of stopwords
stop_words= stopwords.words('english') #import stopwords from NLTK package
readInStopwords = pd.read_csv("pre_process/twitterStopWords.csv", encoding='ISO-8859-1') # import stopwords from CSV file as pandas data frame
readInStopwords = readInStopwords.wordList.tolist() # convert pandas data frame to a list

readInStopwords.append('http')
readInStopwords.append('https')

search_terms = ['#climateStrike','#climatestrike','#climatechange','#GreenNewDeal','#climatecrisis','#climateAction','#FridaysForFuture',
            '#environment','#globalwarming','#GlobalWarming','#ActOnClimate','#sustainability','#savetheplanet',
        '#bushfiresAustralia','#bushfires']

readInStopwords.extend(search_terms)
stop_list = stop_words + readInStopwords # combine two lists i.e. NLTK stop words and CSV stopwords
stop_list = list(set(stop_list)) # strore only unique values 
    
print(stop_list)

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X = tweet_df_location['text_clean']
y = tweet_df_location['search_hashtags']

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.00, stop_words=stop_list, tokenizer=tokenize_only) # Use tf (raw term count) features for LDA.
#X_Vect = tf_vectorizer.fit_transform(X)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)


In [45]:
text_clf = Pipeline([
    ('tfidf',TfidfVectorizer(max_df=0.9, min_df=0.00, stop_words=stop_list, tokenizer=tokenize_only)),
    ('clf',LinearSVC(C=0.01))])

In [46]:
text_clf.fit(X_train,y_train)

TypeError: 'function' object is not iterable

In [20]:
predictions = text_clf.predict(X_test)

In [21]:
from sklearn.metrics import confusion_matrix, classification_report

In [22]:
print(confusion_matrix(y_test,predictions))

[[ 399    1   11   55   28   28   13    1    1   21    0   10]
 [   0  207    0   55    3    0    2    0    3    0    0    0]
 [   8    0 1218  253   57  250   30    3   26   34    4   30]
 [   9    2   32 4356   18  217   28    2   94   39    3   57]
 [   6    8   24  193 1046  109   18    0   28   51    2   10]
 [   5    4   10   91   10 5294   23    7   19   23    4   17]
 [   0    0    5   96    3    8 1202    0   20    3    1   73]
 [   5    4   33   56    9  353   15  418    5    6    4    3]
 [   0    1    1  105    0   17   24    1 1079    5    0   13]
 [   0    0    3   56    0   44    1    1    5 1259    1    7]
 [   0    0    3   20    3   12   25    0   16    2  334   15]
 [   0    0    0   21    2    5   11    0   16    1    1 1680]]


In [23]:
print(classification_report(y_test,predictions))

                   precision    recall  f1-score   support

    #actonclimate       0.92      0.70      0.80       568
       #bushfires       0.91      0.77      0.83       270
   #climateaction       0.91      0.64      0.75      1913
   #climatechange       0.81      0.90      0.85      4857
   #climatecrisis       0.89      0.70      0.78      1495
   #climatestrike       0.84      0.96      0.89      5507
     #environment       0.86      0.85      0.86      1411
#fridaysforfuture       0.97      0.46      0.62       911
   #globalwarming       0.82      0.87      0.84      1246
    #greennewdeal       0.87      0.91      0.89      1377
   #savetheplanet       0.94      0.78      0.85       430
  #sustainability       0.88      0.97      0.92      1737

         accuracy                           0.85     21722
        macro avg       0.89      0.79      0.82     21722
     weighted avg       0.86      0.85      0.85     21722



In [24]:
from sklearn import metrics

In [25]:
metrics.accuracy_score(y_test,predictions)

0.8513028266273823

Accuracy is --- 0.8101003590829574 
Accuracy changed to point 0.85 after tuning the LinearSVC model.

In [26]:
# Save the model
joblib.dump(text_clf, 'twitterLinearSVCModel.pkl') 

['twitterLinearSVCModel.pkl']

In [32]:
text_clf.predict(["Cuyahoga county eliminates use of plastic bags",
                  "Weather is still warm in winters and is not freezing",
                  "Actions are more important for a greater cause",
                  "Its freezing out today may be need to drive my SUV more"
                 ])

array(['#environment', '#climatechange', '#climatechange',
       '#climatestrike'], dtype=object)

In [29]:
text_clf.predict(["Too much rain in moonsoon"])[0]

'#climatechange'

In [30]:
text_clf.predict(["Schools strikes in sweden"])[0]

'#climatestrike'

In [31]:
text_clf.predict(["buy a tesla save the earth"])[0]

'#climatechange'