In [2]:
!pip install nltk



In [3]:
!pip install xgboost



In [None]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from pathlib import Path
import os
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import time
import joblib
import warnings
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [13]:
class Data_preprocessing:
    """Prepare a headline CSV for classification.

    The pipeline (see ``control_function``) loads the CSV, prints basic
    statistics, merges every text column of a row into one ``Headers``
    string, cleans that text, turns it into a bag-of-words matrix and pickles
    a train/test split under ``<cwd>/DATASET``. The fitted vocabulary is
    pickled under ``VECTORIZER_DIR`` so ``predict`` can vectorise new rows
    with the same columns.
    """

    VECTORIZER_DIR = Path.cwd()/'VECTORIZER'
    # NOTE: executed once when the class is defined; silences warnings for the
    # whole process, not only for this class.
    warnings.filterwarnings(action = 'ignore')

    @staticmethod
    def data_validation(path):
        """Check that ``path`` can be opened and, if so, run the whole pipeline.

        Returns the ``Data_preprocessing`` instance, or ``None`` when the file
        cannot be opened. Only the path check is guarded: errors raised by the
        pipeline itself propagate so the real cause is visible.
        """
        try:
            # Open and close immediately - only readability is being checked.
            with open(path, 'rb'):
                pass
        except (OSError, TypeError, ValueError) as error:
            print('_________________________________________________________')
            print("Something was wrong....")
            print(error)
            return None

        print('_________________________________________________________')
        print('Path is correct')
        print('I m starting process of data preprocessing...')
        return Data_preprocessing(path)

    def __init__(self, path, VECTORIZER_DIR = VECTORIZER_DIR):
        """Store the configuration and immediately run ``control_function``.

        path: location of the CSV file to load.
        VECTORIZER_DIR: directory in which the fitted vocabulary is pickled.
        """
        self.df = None
        self.path = path
        self.num_class = None
        self.vector = None
        self.DATA_DIR = None
        self.VECTORIZER_DIR = VECTORIZER_DIR

        self.control_function()

    def data_load(self):
        """Read the CSV at ``self.path`` into ``self.df``."""
        self.df = pd.read_csv(self.path, sep = ',', header = 0, encoding = "ISO-8859-1")

    def data_evaluate(self, df):
        """Print summary statistics and drop incomplete rows when they are rare.

        Sets ``self.num_class`` to the number of distinct ``Label`` values.
        Rows with missing values are removed only when the number of missing
        cells is at most 1% of the number of records. Returns the (possibly
        reduced) frame.
        """
        num_columns = len(df.columns)-1
        num_records = len(df.index)
        missing_value = df.isnull().sum().sum()
        self.num_class = len(df['Label'].unique())
        print('_________________________________________________________')
        print('DataFrame contain {0} records and {1} features'.format(num_records, num_columns))
        print('DataFrame contain {0} missing value'.format(missing_value))
        print('DataFrame contain {0} class to predict'.format(self.num_class))

        # missing_value != 0 implies num_records > 0, so the division is safe.
        if missing_value != 0 and missing_value/num_records <= 0.01:
            print('There is missing less than 1% of entire data. Im starting process to remove missing records')
            df = df.dropna(axis = 0)
            print('DataFrame contain {0} missing value'.format(df.isnull().sum().sum()))
        return df

    @staticmethod
    def _join_columns(df, columns):
        """Concatenate ``columns`` row by row, each value followed by '. '."""
        def as_text(value):
            # A missing headline contributes an empty string instead of
            # breaking the concatenation.
            return '' if pd.isna(value) else str(value)

        headers = pd.Series('', index = df.index, dtype = object)
        for column in columns:
            headers = headers + df[column].map(as_text) + '. '
        return headers

    @staticmethod
    def data_preprocessing(df, is_it_prediction = False):
        """Merge all text columns of every row into a single ``Headers`` column.

        Training mode (default): drops ``Date`` and returns a frame with the
        columns ``Label`` and ``Headers``. Prediction mode: every column of
        ``df`` is text and the result has the single column ``Headers``.
        The returned frame always has a fresh 0..n-1 index.
        """
        if is_it_prediction:
            text_columns = list(df.columns)
        else:
            df = df.drop(labels = 'Date', axis = 1)
            text_columns = [column for column in df.columns if column != 'Label']
        df = df.reset_index(drop = True)

        headers = Data_preprocessing._join_columns(df, text_columns)
        if is_it_prediction:
            return pd.DataFrame({'Headers': headers})
        return pd.DataFrame({'Label': df['Label'], 'Headers': headers})

    @staticmethod
    def text_preprocessing(df, is_it_prediction = False):
        """Lower-case, strip non-letters and remove English stop words.

        Works on the ``Headers`` column in both modes, so ``is_it_prediction``
        is accepted only for backward compatibility. ``df`` is modified in
        place and returned. Requires the NLTK stop-word corpus and the
        tokenizer data used by ``nltk.word_tokenize`` to be installed.
        """
        # Load the stop-word list once; a set gives O(1) membership tests.
        stop_words = set(stopwords.words('english'))

        def clean(text):
            # Everything that is not a letter (digits included) becomes a space.
            text = re.sub(r'[^a-zA-Z]', ' ', text.lower())
            text = re.sub(r'\s+', ' ', text)
            tokens = nltk.word_tokenize(text)
            return ' '.join(word for word in tokens if word not in stop_words)

        df['Headers'] = df['Headers'].apply(clean)
        return df

    def take_vector(self, df, feature_columns = 1):
        """Fit a bag-of-words model and persist its vocabulary.

        df: frame whose column at position ``feature_columns`` holds the text.
        Returns the dense document-term matrix. The vocabulary is written to
        ``VECTORIZER_DIR/VECTORIZER.pkl``, replacing any previous file.
        """
        vectorizer_dir = Path(self.VECTORIZER_DIR)
        # Opening with 'wb' below overwrites an old file, so nothing has to be
        # deleted first (the directory may exist without the file).
        vectorizer_dir.mkdir(parents = True, exist_ok = True)

        vectorizer = CountVectorizer()
        vector = vectorizer.fit_transform(df.iloc[:,feature_columns]).toarray()
        with open(vectorizer_dir/'VECTORIZER.pkl', 'wb') as file:
            pickle.dump(vectorizer.vocabulary_, file)

        return vector

    def data_split(self, df, target_column = 0, y = None):
        """Split the data 80/20 and pickle the four parts under ``<cwd>/DATASET``.

        df: 2-D array. When ``y`` is None the target is taken from column
            ``target_column`` and all remaining columns are features.
        y: optional target values, one per row of ``df``; when given, ``df``
            is treated as features only.
        Returns the directory the files were written to.
        """
        if y is None:
            y = df[:, target_column]
            # Drop exactly the target column, wherever it is.
            X = np.delete(df, target_column, axis = 1)
        else:
            X = df
            y = np.asarray(y)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

        cwd = Path.cwd()
        DATA_DIR = cwd/'DATASET'
        DATA_DIR.mkdir(parents = True, exist_ok = True)

        print('_________________________________________________________')
        print('''I'm saving data...''')
        parts = {
            'X_train.pkl': X_train,
            'y_train.pkl': y_train,
            'X_test.pkl': X_test,
            'y_test.pkl': y_test,
        }
        # 'wb' truncates, so files from an earlier run are simply replaced.
        for file_name, part in parts.items():
            with open(DATA_DIR/file_name, 'wb') as file:
                pickle.dump(part, file)

        return DATA_DIR

    @staticmethod
    def predict(X):
        """Turn raw headline rows into the feature matrix used for training.

        X: 2-D array-like, one row per sample, every column a headline.
        Returns a dense array with one row per sample, vectorised with the
        vocabulary stored in ``Data_preprocessing.VECTORIZER_DIR``.
        """
        X = pd.DataFrame(X)
        X = Data_preprocessing.data_preprocessing(X, is_it_prediction = True)
        X.columns = ['Headers']
        X = Data_preprocessing.text_preprocessing(X, is_it_prediction = True)

        # NOTE(security): unpickling executes code from the file - only load a
        # vocabulary file produced by a trusted run of take_vector.
        with open(Data_preprocessing.VECTORIZER_DIR/'VECTORIZER.pkl', 'rb') as file:
            vocabulary = pickle.load(file)
        vectorizer = CountVectorizer(decode_error = 'replace', vocabulary = vocabulary)
        # Pass the text column: iterating a DataFrame yields its column names.
        return vectorizer.transform(X['Headers']).toarray()

    def control_function(self):
        """Run the full pipeline and print how long it took."""
        start = time.time()

        self.data_load()
        self.df = self.data_evaluate(self.df)
        self.df = Data_preprocessing.data_preprocessing(self.df)
        self.df = Data_preprocessing.text_preprocessing(self.df)
        self.vector = self.take_vector(self.df)
        # The bag-of-words matrix holds features only, so the labels have to
        # be handed over explicitly.
        self.DATA_DIR = self.data_split(self.vector, y = self.df['Label'].to_numpy())

        stop = time.time()
        total = stop - start
        total = round(total, ndigits = 2)

        print('_________________________________________________________')
        print('Process of data preprocessing took {0} seconds'.format(total))
        


In [None]:
# Validate the CSV location and, when it is readable, run the preprocessing pipeline.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
csv_path = r"A:\ML\NLP\stock_analysis\Data.csv"
Data_preprocessing.data_validation(csv_path)

_________________________________________________________
Path is correct
I m starting process of data preprocessing...
_________________________________________________________
DataFrame contain 4101 records and 26 features
DataFrame contain 7 missing value
DataFrame contain 2 class to predict
There is missing less than 1% of entire data. Im starting process to remove missing records
DataFrame contain 0 missing value


In [7]:
# Read only the first data row, skip its first two columns and keep the rest
# as a single-row 2-D array.
first_row = pd.read_csv('Data.csv', sep = ',', header = 0, encoding = "ISO-8859-1", nrows = 1)
first_row_values = np.array(first_row)
test = first_row_values[0, 2:].reshape(1, -1)

In [8]:
# Run the sample row through the prediction-time preprocessing, which loads
# the vocabulary pickled under Data_preprocessing.VECTORIZER_DIR.
x = Data_preprocessing.predict(test)

In [11]:
class Model_build(Data_preprocessing):
    """Load the pickled train/test split and optionally train an XGBoost model.

    Creating an instance runs ``control_function``: every ``*.pkl`` file in
    ``DATA_DIR`` is loaded into ``self.data_dict`` (keyed by file name) and,
    when ``train`` is true, ``train_model`` is called.
    """

    DATA_DIR = Path.cwd()/'DATASET'

    @staticmethod
    def validation(train = False):
        """Build a ``Model_build`` when the default data directory exists.

        train: forwarded to the constructor; False (the default) only loads
            the data.
        Returns the created instance, or ``None`` when there is no data.
        """
        if Model_build.DATA_DIR.exists():
            return Model_build(train = train)

        print('There is no data to train')
        return None

    def __init__(self, DATA_DIR = DATA_DIR, train = False):
        """Store the configuration and immediately run ``control_function``.

        DATA_DIR: directory holding the pickled data set.
        train: when true, the model is fitted right after the data is loaded.
        """
        self.model = None
        self.DATA_DIR = DATA_DIR
        self.data_dict = {}
        self.train = train

        self.control_function()

    def load_data(self):
        """Unpickle every ``*.pkl`` file of ``DATA_DIR`` into ``self.data_dict``."""
        print('_________________________________________________________')
        print('''I'm loading data...''')

        # Only pickle files are read, so unrelated entries in the directory
        # (e.g. checkpoint folders) do not break loading.
        # NOTE(security): unpickling executes code from the file - only load
        # data written by a trusted preprocessing run.
        for file_path in sorted(Path(self.DATA_DIR).glob('*.pkl')):
            with open(file_path, 'rb') as file:
                self.data_dict[file_path.name] = pickle.load(file)

    def train_model(self):
        """Fit an ``XGBClassifier`` on the training part and print test metrics.

        Expects ``load_data`` to have filled ``self.data_dict`` with
        'X_train.pkl', 'y_train.pkl', 'X_test.pkl' and 'y_test.pkl'.
        """
        print('_________________________________________________________')
        print('''I'm training model...''')

        self.model = XGBClassifier()
        self.model.fit(self.data_dict['X_train.pkl'], self.data_dict['y_train.pkl'])

        y_true = self.data_dict['y_test.pkl']
        y_pred = self.model.predict(self.data_dict['X_test.pkl'])

        temp_accuracy = accuracy_score(y_true, y_pred)
        temp_precision = precision_score(y_true, y_pred)
        temp_recall = recall_score(y_true, y_pred)

        print('Initial validation:')
        # Scale to percent before rounding; rounding first and multiplying
        # afterwards reintroduces floating-point noise in the printed value.
        print('Accuracy = {0}'.format(round(temp_accuracy * 100, ndigits = 2)))
        print('Precision = {0}'.format(round(temp_precision * 100, ndigits = 2)))
        print('Recall = {0}'.format(round(temp_recall * 100, ndigits = 2)))

    def control_function(self):
        """Load the data, train when requested and print the elapsed time."""
        start = time.time()
        self.load_data()
        if self.train:
            self.train_model()

        stop = time.time()
        total = stop - start
        total = round(total, ndigits = 2)
        print('_________________________________________________________')
        print('Training process took {0} seconds'.format(total))

In [12]:
# Build a Model_build instance if the DATASET directory exists; otherwise a
# message is printed and nothing is loaded.
Model_build.validation()

_________________________________________________________
I'm loading data...
_________________________________________________________
Training process took 0.01 seconds
