<h2 style="color:blue" align="left"> 1. Import necessary Libraries </h2>

In [1]:
# Read Data
import numpy as np                     # Linear Algebra (calculate the mean and standard deviation)
import pandas as pd                    # manipulate data, data processing, load csv file I/O (e.g. pd.read_csv)

# Visualization
import seaborn as sns                  # Visualization using seaborn
import matplotlib.pyplot as plt        # Visualization using matplotlib
%matplotlib inline

# style
plt.style.use("fivethirtyeight")       # Set Graphs Background style using matplotlib
sns.set_style("darkgrid")              # Set Graphs Background style using seaborn

import warnings                        # Ignore Warnings
warnings.filterwarnings("ignore")

<h2 style="color:blue" align="left"> 2. Load data </h2>

### i) Dataset :1

In [2]:
# Read the nine IRAhandle_tweets_<n>.csv parts (n = 1..9), in order, into df1..df9.
ira_csv_paths = [f"IRAhandle_tweets_{part}.csv" for part in range(1, 10)]
df1, df2, df3, df4, df5, df6, df7, df8, df9 = map(pd.read_csv, ira_csv_paths)

### a) Combine input datasets together

In [3]:
# Stack the nine parts row-wise; ignore_index=True gives the result a fresh 0..n-1 index.
ira_parts = [df1, df2, df3, df4, df5, df6, df7, df8, df9]
df = pd.concat(ira_parts, ignore_index=True)

### b) Filter only English tweets

In [4]:
# Keep only the rows whose `language` column equals 'English'.
# The explicit .copy() makes df_english an independent frame, so assigning to
# its columns afterwards does not raise pandas' SettingWithCopyWarning.
df_english = df.loc[df['language'] == 'English'].copy()

In [5]:
df_english.shape

(2128963, 15)

In [6]:
df_english['language'].value_counts()

English    2128963
Name: language, dtype: int64

In [7]:
df_english['content'].isnull().sum()

1

In [8]:
# Replace missing tweet text with the most frequent value of the column (its mode).
# NOTE(review): mode imputation of free text duplicates an existing tweet;
# dropping the row may be preferable - confirm intent.
most_common_text = df_english['content'].mode()[0]
df_english['content'] = df_english['content'].fillna(most_common_text)

In [9]:
df_english['content'].isnull().sum()

0

In [10]:
# One-column frame holding only the tweet text; preview the first rows.
df_content = df_english['content'].to_frame()
df_content.head()

Unnamed: 0,content
0,"""We have a sitting Democrat US Senator on tria..."
1,Marshawn Lynch arrives to game in anti-Trump s...
2,Daughter of fallen Navy Sailor delivers powerf...
3,JUST IN: President Trump dedicates Presidents ...
4,"19,000 RESPECTING our National Anthem! #StandF..."


In [11]:
df_content.shape

(2128963, 1)

In [12]:
# One 1.0 per row of df_content; used as the label column for this dataset.
temp1 = np.ones(df_content.shape[0])

In [13]:
# Attach the all-ones vector as the `label` column (still float at this point).
df_content['label'] = temp1
df_content.head()

Unnamed: 0,content,label
0,"""We have a sitting Democrat US Senator on tria...",1.0
1,Marshawn Lynch arrives to game in anti-Trump s...,1.0
2,Daughter of fallen Navy Sailor delivers powerf...,1.0
3,JUST IN: President Trump dedicates Presidents ...,1.0
4,"19,000 RESPECTING our National Anthem! #StandF...",1.0


In [14]:
# Cast the label column from float to integer.
df_content['label'] = df_content['label'].astype(int)

In [15]:
df_content.shape

(2128963, 2)

In [16]:
# Draw a random subset of at most 10,000 rows of df_content.
# A seeded generator makes the subset identical on every Restart & Run All;
# an unseeded permutation would train every run on different data.
sample_rng = np.random.default_rng(42)
df_content_shuffle = df_content.take(sample_rng.permutation(len(df_content))[:10000])
df_content_shuffle.head()

Unnamed: 0,content,label
680628,Good morning from the capital of Israel https:...,1
861128,These 18 things you touch every day are full o...,1
1468938,Iceland has one of the largest gun ownership r...,1
542340,#marchforeurope should be #marchforEU or #Marc...,1
2371954,‘Russia didn’t let terrorists capture Damascus...,1


### ii) Dataset :2

### a) Labels: Negative tweets

In [17]:
# Load the second dataset: a tab-separated text file read into a single column named 'content'.
# NOTE(review): default CSV quoting is in effect, so a line starting with an
# unbalanced double quote could swallow following lines - verify the row count.
label = pd.read_csv(
    "tweets-2016-10000-textonly.txt",
    sep='\t',
    names=['content'],
)

In [18]:
# One 0.0 per row of `label`, attached as the label column for this dataset.
temp2 = np.zeros(label.shape[0])
label['label'] = temp2

In [19]:
# Cast the label column from float to integer, then preview the first rows.
label['label'] = label['label'].astype(int)
label.head()

Unnamed: 0,content,label
0,Leaked records show Trump may have avoided tax...,0
1,Donald Trump vs. Hillary Clinton Debate Cold O...,0
2,"@realDonaldTrump The need is to blow off the ""...",0
3,US-Medien: Trump zahlte womöglich jahrelang ke...,0
4,#amjoy #cnn #msnbc #Trump has 2 words for #dep...,0


### iii) Final dataframe with 'content' and 'label'

In [20]:
# Stack the label-1 sample on top of the label-0 tweets and renumber the index from 0.
labelled_frames = [df_content_shuffle, label]
tweets = pd.concat(labelled_frames, ignore_index=True)

In [21]:
tweets.head()

Unnamed: 0,content,label
0,Good morning from the capital of Israel https:...,1
1,These 18 things you touch every day are full o...,1
2,Iceland has one of the largest gun ownership r...,1
3,#marchforeurope should be #marchforEU or #Marc...,1
4,‘Russia didn’t let terrorists capture Damascus...,1


In [22]:
# Shuffle all rows so the two classes are interleaved, then renumber the index.
# random_state pins the shuffle so the row order (and everything derived from
# it downstream) is reproducible across runs.
tweets = tweets.sample(frac = 1, random_state = 42).reset_index(drop=True)
tweets.head(10)

Unnamed: 0,content,label
0,@realDonaldTrump is a petty man of low charact...,0
1,Huckabee SHUTS DOWN Trolling Reporter for Accu...,1
2,Opinion: What Trump will do at GOPdebate #Vega...,1
3,“Trek against Trump” urges voters to choose Hi...,0
4,"Bernie Sanders on NY Times Trump tax report: ""...",0
5,"'@Mashugana_Kat oh, you should watch ""Adam rui...",1
6,Is America Ready For Hillary Clinton's 'Sharia...,0
7,"“Trump thanked and blessed the crowd, pumped h...",0
8,CAUGHT ON TAPE Clinton: Dem 'revolution' appea...,0
9,ISIS on rise,1


In [23]:
# Force every `content` value to a Python string so the regex cleaning below can process it.
tweets['content'] = tweets['content'].astype(str)

<h2 style="color:blue" align="left"> 4. Text Cleaning or Preprocessing </h2>

In [24]:
# Regular expressions, used below to strip URLs and non-letter characters
import re  
  
# Natural Language Toolkit
import nltk  
  
# Download the NLTK stop-word lists (common words such as "the", "and", "of")
nltk.download('stopwords') 
  
# Corpus reader giving access to the downloaded stop-word lists
from nltk.corpus import stopwords 
  
# Porter stemmer, used for stemming
# Stemming reduces inflected forms of a word to a common stem, e.g. loved, loving -> love
# This merges different versions of the same word and hence reduces the sparsity of the matrix
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\deepusuresh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Build `corpus`: one cleaned, stemmed string per row of tweets['content'].

# Stemmer used to reduce every kept word to its stem.
ps = PorterStemmer()

# Build the English stop-word set ONCE. Rebuilding it for every tweet re-reads
# the word list each time and dominates the runtime of this cell.
stop_words = set(stopwords.words('english'))

# Pre-compiled patterns: URLs (anything starting with "http" up to the next
# whitespace) and runs of characters that are not ASCII letters.
url_pattern = re.compile(r"http\S+")
non_letter_pattern = re.compile('[^a-zA-Z]+')


def clean_tweet(text):
    """Normalise one tweet for the bag-of-words model.

    Steps: remove URLs, replace every run of non-letter characters with a
    single space, lower-case, split on whitespace, drop English stop words,
    stem the remaining words, and re-join them with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Space-separated stems; empty if no word survives the filtering.
    """
    text = url_pattern.sub("", text)
    text = non_letter_pattern.sub(' ', text)
    words = text.lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in stop_words)


# Same order as the rows of `tweets`, so corpus[i] lines up with the i-th label.
corpus = [clean_tweet(text) for text in tweets['content']]

<h2 style="color:blue" align="left"> 5. Making the bag of words via sparse matrix </h2>

In [26]:
# Bag-of-words features built with scikit-learn's CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1800 terms; max_features is a knob worth experimenting with.
cv = CountVectorizer(max_features=1800)

# X is the feature matrix (model inputs): the document-term counts as a dense array.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split - consider fitting on the training rows only.
bow_counts = cv.fit_transform(corpus)
X = bow_counts.toarray()

# y is the target: the values of the second column of `tweets` (the 0/1 label).
y = tweets.iloc[:, 1].values

In [27]:
# Split the data into training and test sets in an 80:20 ratio
# (test_size = 0.20 holds out 20% of the rows for testing).
# random_state seeds the shuffle so the split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

<h2 style="color:blue" align="left"> 6. Model building and Evaluation </h2>

### a) Naive Bayes

In [28]:
# Multinomial Naive Bayes with default hyperparameters, fitted on the training split.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

### b) Random Forest

In [31]:
# Fit a Random Forest classifier on the training set
from sklearn.ensemble import RandomForestClassifier 
  
# n_estimators is the number of trees in the forest (worth tuning);
# splits use the entropy criterion and random_state = 7 makes the fit repeatable.
model = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 7) 
model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=7)

### c) Decision Tree

In [32]:
# Decision tree with default hyperparameters, fitted on the training split.
# NOTE(review): no random_state is passed here, unlike the Random Forest above - confirm whether repeatable fits are needed.
from sklearn.tree import DecisionTreeClassifier
DCT = DecisionTreeClassifier()
DCT.fit(X_train, y_train)

DecisionTreeClassifier()

### d) Logistic Regression

In [33]:
# Logistic regression with default hyperparameters, fitted on the training split.
# NOTE(review): warnings are silenced globally in this notebook, so a solver convergence warning would not be shown - verify convergence.
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)

LogisticRegression()

### e) SVM

In [35]:
# Support vector classifier with default hyperparameters, fitted on the training split.
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)

SVC()

### f) XGBoost

In [34]:
# XGBoost classifier with default hyperparameters, fitted on the training split.
# Only the class is imported: `xgb` is the name of the fitted estimator (it is
# pickled under that name further down), so a module alias with the same name
# would simply be shadowed by the assignment below.
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

-------------------

In [37]:
import pickle

In [40]:
# Persist every fitted model and the fitted CountVectorizer as pickle files.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
artifacts = {
    'Naive_Bayes_model.pkl': classifier,
    'Random_Forest_model.pkl': model,
    'Decision_Tree_model.pkl': DCT,
    'Logistic_Regression_model.pkl': LR,
    'svm_model.pkl': svm,
    'XGBoost_model.pkl': xgb,
    'cv-transform.pkl': cv,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)