In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib




In [3]:
tweet_df_location = pd.read_csv('../data/graphAnalysis/clean_climateTwitterData_Jan20.csv')

In [4]:
tweet_df_location.columns

Index(['Unnamed: 0', 'id', 'author_id', 'text', 'retweets', 'permalink',
       'date', 'formatted_date', 'favorites', 'mentions', 'hashtags', 'geo',
       'urls', 'search_hashtags', 'location', 'sentiment_polarity',
       'sentiment_subjectivity', 'textBlob_sentiment', 'vader_compound',
       'vader_pos', 'vader_neg', 'vader_neu', 'V_Sentiment'],
      dtype='object')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X = tweet_df_location['text']
y = tweet_df_location['search_hashtags']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)


In [8]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])

In [9]:
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [10]:
predictions = text_clf.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix, classification_report

In [12]:
print(confusion_matrix(y_test,predictions))

[[ 420    1   17   38   27   30    5    5    1   17    0    7]
 [   0  224    2   31    6    1    2    1    3    0    0    0]
 [  14    3 1216  219   61  246   26   39   26   30    7   26]
 [  38   17  219 3821  139  246   81   29  132   62   23   50]
 [  16    9   75  156 1015  119   14   11   27   42    4    7]
 [  15    5   88  132   54 5009   15  101   26   39    9   14]
 [   9    0   13   66   11   14 1186    1   27    4   10   70]
 [   3    3   35   30   11  321    1  483    6    6    6    6]
 [   2    0    4   67   17   33   18    1 1089    5    5    5]
 [  13    0   12   49   27   40    6    7    6 1205    6    6]
 [   1    0    0    9    5    7   22    1   14    0  358   13]
 [   4    0    9   39    8    8   59    5   19    3   12 1571]]


In [13]:
print(classification_report(y_test,predictions))

                   precision    recall  f1-score   support

    #actonclimate       0.79      0.74      0.76       568
       #bushfires       0.85      0.83      0.84       270
   #climateaction       0.72      0.64      0.67      1913
   #climatechange       0.82      0.79      0.80      4857
   #climatecrisis       0.73      0.68      0.71      1495
   #climatestrike       0.82      0.91      0.87      5507
     #environment       0.83      0.84      0.83      1411
#fridaysforfuture       0.71      0.53      0.61       911
   #globalwarming       0.79      0.87      0.83      1246
    #greennewdeal       0.85      0.88      0.86      1377
   #savetheplanet       0.81      0.83      0.82       430
  #sustainability       0.89      0.90      0.89      1737

         accuracy                           0.81     21722
        macro avg       0.80      0.79      0.79     21722
     weighted avg       0.81      0.81      0.81     21722



In [14]:
from sklearn import metrics

In [15]:
metrics.accuracy_score(y_test,predictions)

0.8101003590829574

Accuracy is --- 0.8101003590829574 

In [16]:
# Save the model
joblib.dump(text_clf, 'twitterLinearSVCModel.pkl') 

['twitterLinearSVCModel.pkl']

In [17]:
text_clf.predict(["This climate is getting warmer.."])

array(['#climatestrike'], dtype=object)

In [18]:
text_clf.predict(["People are getting aware of the surrounding "])

array(['#environment'], dtype=object)

In [19]:
text_clf.predict(["Actions are more important for a greater cause"])

array(['#climateaction'], dtype=object)

In [20]:
text_clf.predict(["Too much rain in moonsoon"])[0]

'#climatechange'

In [21]:
text_clf.predict(["Schools strikes in sweden"])[0]

'#climatestrike'

In [22]:
text_clf.predict(["buy a tesla save the earth"])[0]

'#environment'

In [23]:
#Its freezing out today may be need to drive my SUV more