## Political leaning classification with SVM

In [0]:
# Mount Google Drive (Colab-specific helper) so the pickles under
# /content/gdrive read by the following cells are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import pickle

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

### Keywords extracted

Keywords extracted from ETS and FAR (sx)

In [7]:
# Load the "sx" keyword set (extracted from ETS and FAR) and show its size.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
keyword_sx_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/final_keywordsx.pickle'
with open(keyword_sx_path, 'rb') as data:
    KEYWORD_SX = pickle.load(data)
len(KEYWORD_SX)

433

In [8]:
# Dump the complete "sx" keyword set for inspection (long output).
print(KEYWORD_SX)

{'synagogue_shooting', 'history_trump', 'right_wall', 'fuck_video', 'trump_base', 'migrant_caravan', 'trump_pardon', 'propaganda_machine', 'school_talk', 'fuck_nazi', 'campaign_finance', 'official_say', 'remember_republican', 'get_shut', 'help_keep', 'pittsburgh_synagogue', 'archive_brutalist', 'support_pajama', 'proud_boy', 'attendee_elizabeth', 'former_trump', 'trump_attack', 'show_youtube', 'nazi_salute', 'tantrum_protest', 'report_white', 'great_love', 'mueller_team', 'button_idiot', 'domestic_terrorism', 'ban_get', 'defense_secretary', 'wilson_respond', 'trump_lose', 'tower_moscow', 'lawyer_trump', 'rightward_turn', 'dude_kill', 'anti_fascist', 'attack_attendee', 'synagogue_shooter', 'legal_team', 'wizard_illusion', 'rep_steve', 'feminine_wizard', 'trump_approval', 'tucker_viewer', 'nazi_germany', 'want_trump', 'judiciary_committee', 'original_post', 'million_dollar', 'ron_desantis', 'trump_response', 'right_activist', 'right_rally', 'david_pakman', 'fuck_right', 'first_year', 'ri

Keywords extracted from T_D (dx)

In [10]:
# Load the "dx" keyword set (extracted from T_D) and show its size.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
keyword_dx_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/final_keyworddx.pickle'
with open(keyword_dx_path, 'rb') as data:
    KEYWORD_DX = pickle.load(data)
len(KEYWORD_DX)

769

In [11]:
# Dump the complete "dx" keyword set for inspection (long output).
print(KEYWORD_DX)



In [16]:
# Sanity check: the two keyword sets are expected to be disjoint because the
# shared keywords were already dropped upstream.
shared_keywords = KEYWORD_DX & KEYWORD_SX
print(len(shared_keywords))

0


Merging keywords

In [17]:
# Load the previously saved merged keyword collection and show its size.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
all_keyword_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/all_keyword.pickle'
with open(all_keyword_path, 'rb') as handle:
    tot_keyword = pickle.load(handle)
len(tot_keyword)

1202

In [0]:
# Convert to a list: later cells sort it in place and use it as column labels.
tot_keyword=list(tot_keyword)

### Data preparation for SVM

The posts that were used to extract the keywords are reused to build the training dataset for the SVM (the example below uses the data from FAR).

In [0]:
# Load the bigram-tokenised FAR posts (one token sequence per post).
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
bigram_far_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/bigram_FAR.pickle'
with open(bigram_far_path, 'rb') as handle:
    bigramTOT_FAR = pickle.load(handle)

In [0]:
# Turn each post's bigram tokens into one space-separated string, wrapped in a
# single-element list (the shape the vectorizer cell below consumes).
post_FAR = [[' '.join(tokens)] for tokens in bigramTOT_FAR]

In [24]:
# Preview the first five posts; each is a one-element list holding the
# space-joined bigram tokens of that post.
post_FAR[:5]

[['leader_allege allege_producer producer_pay pay_fake fake_scene scene_cancel cancel_amp amp_series series_realize realize_reality reality_tv tv_work'],
 ['happy_new new_year year_may may_dream dream_come come_true true_xoxo'],
 ['extremist_website website_insist insist_armed armed_march march_jewish jewish_people people_montana montana_go'],
 ['picture_proof proof_supremacist supremacist_want want_peace'],
 ['wreck_woman woman_destroy destroy_civilization civilization_response response_black black_pigeon pigeon_speak']]

Build an occurrence vector for every post. The length of each vector is equal to the number of keywords.

In [25]:
# Build one keyword-occurrence vector per post.
# The vocabulary is learned from the keyword list itself, so (assuming every
# keyword survives the default tokenizer as a single token) each output vector
# has one slot per keyword.
vectorizer = CountVectorizer()
vectorizer.fit(tot_keyword)

# Summarise the fitted vocabulary. The feature names are rebuilt from
# vocabulary_ (term -> column index) instead of calling get_feature_names(),
# which was removed from recent scikit-learn releases; ordering by column index
# yields the same list on every version.
print(vectorizer.vocabulary_)
print(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

# Encode all posts with a single transform call instead of one call per post.
# Every entry of post_FAR is a one-element list holding the post text.
post_matrix = vectorizer.transform([elem[0] for elem in post_FAR])

# Keep the original container shape: a list with one dense count list per post.
# Note: toarray() materialises the full (posts x keywords) matrix in memory.
res = [list(row) for row in post_matrix.toarray()]



In [26]:
# Count how many vectors consist only of zeros, i.e. posts that contain none
# of the selected keywords.
tot = sum(1 for elem in res if all(value == 0 for value in elem))
print(tot)

25577


Labelling training data

In [0]:
# Load the saved occurrence vectors for the FAR posts.
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
vectors_far_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/vectors_FAR.pickle'
with open(vectors_far_path, 'rb') as handle:
    vectors_FAR = pickle.load(handle)

In [0]:
# Keep only the first 5000 FAR vectors for the training frame.
FAR_training=pd.DataFrame(vectors_FAR[:5000])

In [31]:
# Preview the first rows of the training frame (columns are still positional here).
FAR_training.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1162,1163,1164,1165,1166,1167,1168,1169,1170,1171,1172,1173,1174,1175,1176,1177,1178,1179,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189,1190,1191,1192,1193,1194,1195,1196,1197,1198,1199,1200,1201
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Sort the keywords in place and use them as the column labels.
# NOTE(review): this presumes the vector columns follow sorted keyword order
# (CountVectorizer indexes its vocabulary alphabetically) - confirm that
# vectors_FAR was produced that way.
# Re-running this cell after 'pol_leaning' has been added is expected to fail,
# because the frame then has one more column than there are keywords.
tot_keyword.sort()
FAR_training.columns=tot_keyword

In [0]:
# Tag every FAR row with class label 1 in a new 'pol_leaning' column.
# NOTE(review): presumably 1 denotes the "sx" side that FAR belongs to - confirm.
FAR_training['pol_leaning']=1

## SVM

In [0]:
# Load the assembled SVM input frame (keyword columns plus 'pol_leaning').
# NOTE: pickle.load can execute arbitrary code; only open files you produced yourself.
input_svm_path = '/content/gdrive/My Drive/Tesi/LDA2/Related/input_SVM.pickle'
with open(input_svm_path, 'rb') as handle:
    training_data = pickle.load(handle)

In [54]:
# Preview the first rows of the SVM input frame.
training_data.head()

Unnamed: 0,abc_news,accord_definition,action_president,active_supporter,ad_youtube,administration_official,admit_villain,advise_trump,africa_look,afternoon_magathread,air_force,alexandria_ocasio,alt_righter,amendment_right,america_first,america_great,american_citizen,american_democracy,american_flag,american_history,american_include,amp_local,amp_nbsp,andrew_anglin,andrew_mccabe,angela_merkel,announce_intent,anonymous_source,answer_question,anthony_weiner,anti_fascist,anti_gun,anti_immigration,anti_muslim,anti_semitic,anti_semitism,anyone_explain,anyone_feel,anyone_find,anyone_get,...,weird_politic,west_virginia,west_wing,western_civilization,white_guy,white_lady,white_male,white_nationalism,white_nationalist,white_power,white_privilege,white_woman,whole_thing,wilson_respond,win_special,wing_commit,wing_extremist,wish_could,wizard_illusion,word_rashida,word_text,work_trump,world_leader,would_appreciate,would_fight,would_get,would_go,would_happen,would_like,would_love,would_make,would_need,would_say,would_take,wtf_go,year_prison,york_city,youtube_ban,youtube_recommend,pol_leaning
35581,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
38983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8932,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
35365,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6305,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Features are every column except the label. Dropping 'pol_leaning' instead of
# slicing up to 'youtube_recommend' avoids hard-coding the name of the last
# keyword column, which changes whenever the keyword list changes.
X = training_data.drop(columns='pol_leaning')  # training features
y = training_data['pol_leaning']  # labels

In [0]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

SVM with linear kernel

In [0]:
# Support-vector classifier with a linear kernel; every other setting is left at its default.
clf = svm.SVC(kernel='linear')

In [0]:
# Train the linear-kernel model on the training split.
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
# Predict the labels of the held-out test split.
y_pred = clf.predict(X_test)

In [0]:
# Report accuracy, precision and recall of the predictions on the test split.
for metric_label, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Accuracy: 0.6085925925925926


Precision: 0.7220916568742656
Recall: 0.203476821192053


Grid search with rbf kernel


In [0]:
# Default SVC used as the base estimator for the grid search.
# Note: this rebinds clf, so the fitted linear-kernel model is no longer reachable under that name.
clf = svm.SVC()

In [0]:
# Candidate settings for the RBF-kernel SVM: kernel and gamma are fixed, only
# the regularisation strength C varies.
param_dist = dict(
    kernel=['rbf'],
    gamma=['auto'],
    C=[0.001, 0.01, 0.1, 1, 10],
)
# Exhaustive search over the grid with 3-fold cross-validation on one worker.
search = GridSearchCV(
    clf,
    param_grid=param_dist,
    n_jobs=1,
    cv=3,
)

In [0]:
# Run the cross-validated grid search on the training split only.
search.fit(X_train, y_train)

In [0]:
def report(results, n_top=3):
    """Print the best-ranked parameter settings from a hyper-parameter search.

    Args:
        results: Mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' and 'params', for example the ``cv_results_``
            attribute of a fitted search object. The values may be NumPy
            arrays or plain sequences.
        n_top: Number of ranks to print, starting at rank 1. Candidates tied
            on a rank are all printed, so more than ``n_top`` entries may
            appear.

    Returns:
        None. The summary is written to standard output.
    """
    # Coerce to an array so the element-wise comparison below also works when
    # the ranks arrive as a plain list (list == int is a single False, which
    # would silently select nothing).
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(ranks == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [0]:
# Show the top-ranked parameter settings found by the grid search.
report(search.cv_results_)

Model with rank: 1
Mean validation score: 0.559 (std: 0.000)
Parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}

Model with rank: 2
Mean validation score: 0.555 (std: 0.000)
Parameters: {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

Model with rank: 3
Mean validation score: 0.555 (std: 0.000)
Parameters: {'C': 0.001, 'gamma': 'auto', 'kernel': 'rbf'}

Model with rank: 3
Mean validation score: 0.555 (std: 0.000)
Parameters: {'C': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}

Model with rank: 3
Mean validation score: 0.555 (std: 0.000)
Parameters: {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}

