## About this code

This script applies the entire pipeline to a new batch of tweets:<br>
1. Pre-process
2. Feature engineering step-1 (low computation)
3. Relevance classification (0/1-irrelevant)
4. Filter out non-relevant tweets
5. Feature engineering step-2 (high computation)
6. Support classification (-1/0/1)

## Run configurations

<div class="alert alert-block alert-success">
'min_couple_appearances': 0<br>
'feature_selection': False, # When predicting new data, no automatic feature selection is applied.<br>
                            # The specific features are selected according to the model's needs.
</div>

In [None]:
# Run configuration shared by the relevance and support stages of the pipeline.
config = {
    # ---- Mode ----
    # True during the training phase; False when operational
    # (predicting new data with an existing model).
    'learning': False,
    'distributed_pipeline': False,

    # ---- Data ----
    # Whether to filter labeled data by tagger (Yogev, Ofir and Itay).
    'filter_taggers': False,
    # The target of the algorithm.
    'target': 'relevance',
    # Whether to load a complete data frame instead of executing the entire
    # pipeline from the beginning.
    'load_df_from_pickle': False,

    # ---- Pre-process ----
    # Whether to handle slang words (also used by feature engineering).
    'slang': True,
    # Whether to use spell correction in the pre-process phase.
    'spell_correction': False,
    # Column to pre-process - used by the proccess_tweet function
    # in tweet_pre_proccess.py.
    'col': 'text',

    # ---- Feature engineering ----
    'nlp_features': True,
    'word_type': False,
    'dominant_keywords': True,
    # One of: entropy, purity, smart_error.
    'dominant_keywords_metric': 'smart_error',
    'user_features': False,
    'user_bio': False,
    # Time and event features.
    'time_and_event': False,
    'twitter_foundation_date': '2006-03-21 12:00:00',
    'network_features': False,
    'load_network_data': False,
    'nlp_raw': True,
    'emotion': False,
    'hashtags_and_mentions': True,
    # Number of dominant words to use in the NLP features
    # (parameter k of the dominant_keywords function).
    'num_dominant_words': 100,
    # Minimum number of appearances (#tweets) for a word to be considered dominant.
    'min_word_appearances': 4,
    # For a couple of dominant words: how many times the combination must
    # appear in the data in order to become a feature.
    # NOTE(review): the "Run configurations" note above this cell lists 0 for
    # this key - confirm which value is intended for prediction runs.
    'min_couple_appearances': 4,
    'url_features': False,
    'Tweets_media_and_content': True,
    'country_support': False,
    'entities_features': True,
    # NLTK sentiment feature.
    'sentiment': False,

    # ---- Feature selection ----
    'feature_selection': False,
    'remove_features_zero_variance': False,
    'remove_correlated_features': False,
    'remove_low_correlated_features': False,
    'feature_importance': False,
    # Percentile threshold for the minimum (absolute) correlation between
    # a feature and the target variable.
    'corr_per_thresh': 0.0,
    # Percentile threshold for the minimum importance of a feature.
    'importance_per_thresh': 33,
    # Whether to use PCA to reduce dimensionality.
    'PCA': False,
    # Share of cumulative explained variance required from PCA
    # (affects the number of principal components).
    'PCA_var': 0.995,

    # ---- Model ----
    'model': 'Random Forest',
    # Whether to import existing models from pickle.
    'load_model_pickle': False,
    # Whether to also run a regression model.
    'regression': False,
    # Whether to split the training data into train and validation sets.
    'validation': False,
    # Probability threshold used to classify a tweet as negative/neutral/positive.
    'class_threshold': 0.43,
    'bench_from_pickle': True,
}

Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import seaborn as sns
import os
import glob
import csv
import time
import matplotlib
import pickle
from tqdm import tqdm, trange, tqdm_notebook, tqdm_pandas
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import sklearn.metrics as metrics

In [None]:
# Avoid trimming text in the Jupyter preview.
# `None` is the supported "no limit" value (pandas >= 1.0); the old `-1`
# sentinel was deprecated in pandas 1.0 and is not accepted by newer releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# Use the 'ggplot' style sheet for the matplotlib figures drawn in this notebook
plt.style.use('ggplot')
# IPython magic (not plain Python): show figures inline in the notebook output
%matplotlib inline

In [None]:
# Seed NumPy's global random generator so runs are reproducible.
# The seed has to be set by *calling* np.random.seed; assigning to the name
# would only replace the function with an integer and leave the RNG unseeded.
np.random.seed(1234)
# Register tqdm with pandas so that `progress_apply` is available on pandas objects.
# NOTE(review): a notebook progress-bar instance is passed positionally here; recent tqdm
# documentation shows `tqdm.pandas(**kwargs)` with keyword arguments only - confirm this
# call form is supported by the installed tqdm version.
tqdm.pandas(tqdm_notebook())

#### Import thesis modules

In [None]:
import domain_entities
import nlp_url_features
import tweet_pre_proccess
import nlp_features
import eda
import ml_model
import slang
import pipeline

In [None]:
# The prediction modules are imported right after switching into their sibling
# directories (presumably so each import resolves from the current working
# directory - confirm that it is on sys.path). Statement order matters here.
os.chdir('../relevance algo')
import relevance_prediction
#import political_prediction
os.chdir('../support algo')
import support_prediction
# NOTE: the working directory is left at '../support algo' after this cell;
# the relative chdir in the "Data import" section starts from there.

## Data import

In [None]:
# Switch the working directory to the relevance-algorithm folder; the relative
# paths in the following cells (input pickle, predictions output) are
# presumably written relative to this location.
os.chdir('../relevance algo')

In [None]:
# Load the new batch of tweets from a pickled pandas object (used as a DataFrame below).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
data = pd.read_pickle('../../data/quoted_source_emotion.pickle')

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None
# Uncomment to preview the first rows: data.head(n=5)

## 1. Relevance Algo

<div class="alert alert-block alert-info">
df object explanation:<br>
    - df_relevance_no_filter_no_prediction = no filtered features, no prediction<br>
    - df_relevance_no_filter_w_prediction.tsv = all features + relevance prediction<br>
    - data_w_relevance = selected features for relevance algo + no prediction
</div>

#### Run prediction

In [None]:
# Run the relevance stage on the raw batch with the current config.
# Two frames come back: `df` supplies the 'relevance' column used below, and
# `full_df` is carried forward to the filtering step and the support stage.
df, full_df = relevance_prediction.predict(data, config)

In [None]:
# Summarize results: bar chart of the number of rows per predicted class,
# with the bars ordered from the most to the least frequent class
plt.figure(num=None, figsize=(10, 6), facecolor='w', edgecolor='k')
target_col = config['target']
class_order = df[target_col].value_counts().index
h = sns.countplot(x=target_col, data=df, order=class_order)

##### Adding relevance feature to original data

In [None]:
# Copy the relevance labels from `df` onto the original tweets.
# `.values` assigns by position rather than by index label, so `df` must have
# the same number of rows, in the same order, as `data`.
data['relevance'] = df['relevance'].values
# Optional export of the intermediate result:
# data.to_csv('data_w_relevance.tsv', header=True, sep='\t')

#### Remove irrelevant tweets

In [None]:
# Filter irrelevant tweets using the relevance label/prediction.
# The feature frame is filtered by the project helper; the raw tweets are filtered separately below.
full_df = relevance_prediction.remove_irrelevants(df, full_df)
# Keep only rows whose relevance is not 2, then renumber the index from 0.
# NOTE(review): the pipeline overview at the top describes relevance as "0/1-irrelevant",
# while this filter drops label 2 - confirm which value marks an irrelevant tweet.
# `data` presumably has to keep exactly the rows that remain in `full_df`, because the
# support predictions are later written back onto `data` by position.
data = data[data.relevance!=2].reset_index(drop=True)

## 2. Support Algo

#### Change configuration

In [None]:
# Re-point the shared configuration at the support stage: switch the target,
# enable the additional feature groups and set the support-specific thresholds
config.update({
    'target': 'support',
    'sentiment': True,
    'emotion': True,
    'country_support': True,
    'word_type': True,
    'importance_per_thresh': 60,
    'num_dominant_words': 250,
    'user_features': True,
    'user_bio': True,
    'time_and_event': True,
})

#### Adding support features

In [None]:
# Extend the filtered feature frame with the support-stage features enabled in config.
# adding_features returns an indexable result; only its first element is kept as the new full_df.
full_df = support_prediction.adding_features(full_df, config)[0]

#### Run prediction

In [None]:
# Run the support stage. `df` receives the data frame for the support algorithm
# (filtered features + support prediction, per the original author's note);
# an updated `full_df` is returned alongside it.
df, full_df = support_prediction.predict(full_df, config)

In [None]:
# Summarize results: bar chart of the number of rows per predicted class,
# with the bars ordered from the most to the least frequent class
plt.figure(num=None, figsize=(10, 6), facecolor='w', edgecolor='k')
target_col = config['target']
class_order = df[target_col].value_counts().index
h = sns.countplot(x=target_col, data=df, order=class_order)

##### Adding support feature to original data

In [None]:
# Copy the support labels from `df` onto the remaining tweets.
# `.values` assigns by position, so `df` must have the same number of rows,
# in the same order, as `data`.
data['support'] = df['support'].values

### Export results

In [None]:
# Write the tweets with their predictions to a tab-separated file whose name
# carries today's date (YYYY-MM-DD)
run_date = datetime.today().date().isoformat()
output_path = '../pipeline/predictions/data_w_predictions_{}.tsv'.format(run_date)
data.to_csv(output_path, sep='\t', header=True)