# Test for Feature Extraction Script
**Applying tests on feature extraction functions**

**Moving to the current directory of the script**

In [1]:
cd "D:\University\FIT3162\Project\Fake-News-Detection\Feature Extraction"

D:\University\FIT3162\Project\Fake-News-Detection\Feature Extraction


**This class of feature extraction bought here to avoid importing each time**

In [1]:
class FeatureExtraction:
    """
    This class is used to build a pipeline for sentiment feature extraction using Vader
    """
    def __init__(self):
        self.post_file = "cleaned_df.csv"
        self.comment_file = "cleaned_comments.csv"
        self.post_df = None
        self.comment_df = None
        self.comment_scores = {}
    
    def read_datasets(self):
        self.post_df = pd.read_csv(self.post_file, index_col = 0, encoding = "ISO-8859-1")
        self.comment_df = pd.read_csv(self.comment_file, index_col=0)
    
    def print_statistics(self):
        print("Number of Posts", len(self.post_df))
        print("Number of Comments", len(self.comment_df))
        print("Number of Fake Posts", len(self.post_df.loc[self.post_df['2_way_label'] == 0]))
        print("Number of True Posts", len(self.post_df.loc[self.post_df['2_way_label'] == 1]))
        
    def get_sample_posts(self, sample_size):
        self.post_df = self.post_df.sample(sample_size, random_state = 123).reset_index(drop=True)
        
    def filter_comments(self):
        ids = self.post_df.id.unique()
        print(ids)
        self.comment_df = self.comment_df[self.comment_df['submission_id'].isin(ids)]
        self.comment_df = self.comment_df.reset_index(drop=True)
        
    def build_comment_score(self):
        #creating hashtables with post id as key
        for ind in self.post_df.index:
            # hastable for score of comments
            self.comment_scores[self.post_df['id'][ind]] = [0, 0]
        sentiment_vader.build_comment_dictionary(self.comment_df, self.comment_scores)
        
    def cmnt_sentiment_column(self):
        """
        Add the comment sentiment Column to Post dataset
        """
        temp = list(self.comment_scores.values())
        score = [x[0]/x[1] if x[1] > 0 else x[1] for x in temp]
        num_comments = [x[1] for x in temp]
        
        self.post_df["num_comments"] = num_comments
        self.post_df["comment_sentiment"] = score
            
    def post_sentiment_column(self):
        self.post_df['post_sentiment'] = self.post_df.apply(lambda x: sentiment_vader.post_sentiment(x['title']), axis=1)
        
    def build_pipeline(self):
        print("Step 1: Reading Dataset")
        self.read_datasets()
        print("Step 2: Filter Posts")
        self.get_sample_posts(6)
        print("Step 3: Filter Comments")
        self.filter_comments()
        print("Step 4: Building Comment Score Dictionary")
        self.build_comment_score()
        print("Step 6: Add Comment Score Column")
        self.cmnt_sentiment_column()
        print("Step 7: Add Post Score Column")
        self.post_sentiment_column()
        print("---DONE---")
        
    def get_post_dataset(self):
        return self.post_df

In [2]:
import unittest
import pytest
import pandas as pd
import numpy as np
import import_ipynb
import sentiment_vader
from pandas.testing import assert_frame_equal

In [3]:
feature = FeatureExtraction()
feature.read_datasets()

  mask |= (ar1 == a)


In [46]:
class TestFeatureExtraction(unittest.TestCase):
    
    def test_read_datasets(self):
        feature = FeatureExtraction()

        expected_posts = pd.read_csv('cleaned_df.csv', encoding = "ISO-8859-1")
        expected_comments = pd.read_csv('cleaned_comments.csv')

        feature.read_datasets()

        result_posts = feature.post_df
        result_comments = feature.comment_df
        

        expected_rows_posts = len(expected_posts.index)
        result_rows_posts = len(result_posts.index)
        
        expected_rows_comments = len(expected_comments.index)
        result_rows_comments = len(result_comments.index)
  
        self.assertEqual(expected_rows_posts, result_rows_posts)
        self.assertEqual(expected_rows_comments, result_rows_comments)
        
    def test_cmnt_sentiment_column(self):
        feature.filter_comments()
        feature.build_comment_score()
        feature.cmnt_sentiment_column()
        
        feature.get_post_dataset().to_csv('df1.csv')
        result = pd.read_csv('df1.csv', encoding = "ISO-8859-1")
        assert abs(result.iloc[0]['comment_sentiment'] - 0.088095896) <= 0.001,
        'Perform Comment sentiment test failed'

        
        
    def test_post_sentiment_column(self):
        
        feature.post_sentiment_column()
        
        result = feature.get_post_dataset().to_csv('df2.csv')
        result = pd.read_csv('df2.csv', encoding = "ISO-8859-1")
        assert result.iloc[0]['post_sentiment'] == 0
        assert abs(result.iloc[1]['post_sentiment'] - 0.7906) <= 0.001, 
        'Perform Post Sentiment test failed'
            

In [51]:
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored for jupyter'], exit=False)

['86byl8' '796d3z' '6ttusb' '75yogr' 'd0x9x3' '54zzy4' 'atmab5' '8zhu6j'
 'ben0cn' '7erxmm' '9b2qiv' '52q9bj' '6uo9g3' '54jf92' '1egffl' 'd5uvv8'
 '6bvpes' '6qnyqt' 'c62msf' '6kxdds' 'cc8eid' 'btlq8x' '42l6vy' '4ooe12'
 '6rljfc' '96hi5z' '7m9jvd' '5ulz1z' '593joh' '745qsl']


  mask |= (ar1 == a)
.
----------------------------------------------------------------------
Ran 3 tests in 31.701s

OK
