In [1]:
import csv
import pandas as pd
import math
import numpy as np
import datetime
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

In [126]:
def mapDayToNumber(day):
    """Translate an English weekday name into its 1-based slot in a Saturday-first week.

    Returns 1 for "Saturday" up to 7 for "Friday". Any value that does not
    equal one of the seven names yields 0.
    """
    weekFromSaturday = ("Saturday", "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
    return next(
        (slot for slot, name in enumerate(weekFromSaturday, start=1) if name == day),
        0,
    )
def mapTimeToNumber(time):
    """Translate a zero-padded 12-hour label such as "03PM" into an hour index 0-23.

    "12AM" maps to 0, "11AM" to 11, "12PM" to 12 and "11PM" to 23.
    A label that is not in the table also yields 0, so the return value alone
    cannot distinguish an unknown label from "12AM".
    """
    # Within each half of the day the labels run 12, 01, 02, ..., 11.
    hourLabels = [
        "%02d%s" % (clockHour, suffix)
        for suffix in ("AM", "PM")
        for clockHour in (12, *range(1, 12))
    ]
    return next(
        (hour for hour, label in enumerate(hourLabels) if label == time),
        0,
    )
    
def fetchDay(createTime):
    """Return the weekday number encoded in a "<weekday> <hour label>" string.

    The string is split on a single space and must produce exactly two parts,
    otherwise the unpacking raises ValueError. The first part is converted
    with mapDayToNumber.
    """
    weekdayName, _hourLabel = createTime.split(' ')
    return mapDayToNumber(weekdayName)

def fetchTime(createTime):
    """Return the hour index encoded in a "<weekday> <hour label>" string.

    The string is split on a single space and must produce exactly two parts,
    otherwise the unpacking raises ValueError. The second part is converted
    with mapTimeToNumber.
    """
    _weekdayName, hourLabel = createTime.split(' ')
    return mapTimeToNumber(hourLabel)

def correctThePrice(oldPrice):
    """Normalise a raw price value, or flag it as unusable.

    Rules, by range of ``oldPrice``:
      * 10 <= oldPrice <= 4500        -> oldPrice * 1000
      * 0 < oldPrice < 10             -> -11 (rejected)
      * 4500 < oldPrice < 10000       -> -11 (rejected)
      * anything else (<= 0, >= 10000, or a value for which every
        comparison is false, such as NaN) -> returned unchanged

    The -11 marker lies below the ``> -10`` filter that getProcessedData
    applies afterwards, so rejected rows are dropped there.
    """
    if 10 <= oldPrice <= 4500:
        return oldPrice * 1000

    rejected = (0 < oldPrice < 10) or (4500 < oldPrice < 10000)
    if rejected:
        return -11

    return oldPrice

def getAveragePricesPerBrands(df, brands):
    """Compute the truncated mean price of every brand, ignoring the -1 sentinel.

    Args:
        df: DataFrame with at least a 'brand' and a 'price' column.
        brands: iterable of brand keys to report. Every brand value that
            occurs in ``df`` on a row with price != -1 must be present here,
            otherwise a KeyError is raised.

    Returns:
        dict mapping each entry of ``brands`` to ``int(sum / count)`` over the
        rows of that brand whose price is not -1. A brand without any such
        row maps to 0 instead of triggering a division by zero.
    """
    sums = dict.fromkeys(brands, 0)
    counts = dict.fromkeys(brands, 0)

    # Walk the two columns directly; building a Series per row via iterrows
    # is far slower and not needed for two fields.
    for brand, price in zip(df['brand'], df['price']):
        if price != -1:
            sums[brand] += price
            counts[brand] += 1

    # Brands with no priced rows keep the default 0 rather than dividing by zero.
    return {
        brand: int(sums[brand] / counts[brand]) if counts[brand] else 0
        for brand in brands
    }
        
    

In [131]:
def getProcessedData(dataFile, brandEncType = "onehot", cityEncType = "onehot") :
    """Load the listings CSV and turn it into feature rows, price targets and feature names.

    Processing steps, in order:
      1. Read the columns brand, city, title, desc, image_count, created_at
         and price from ``dataFile``.
      2. Optionally append one-hot columns for brand (prefix "brand") and
         city (prefix "city").
      3. Replace created_at by the numeric columns 'dayOfweek' and 'time'
         (via fetchDay / fetchTime).
      4. Pass every price through correctThePrice and keep only rows whose
         corrected price is greater than -10.
      5. For rows whose price is the sentinel -1, use the average price of
         the row's brand (via getAveragePricesPerBrands) as the target.

    Args:
        dataFile: anything pandas.read_csv accepts as its first argument.
        brandEncType: "onehot" appends the brand dummy columns; any other
            value skips that encoding.
        cityEncType: "onehot" appends the city dummy columns; any other
            value skips that encoding.

    Returns:
        A list ``[data, targetData, labels]`` where
          * data is a list with one pandas Series per kept row, without the
            price, brand and city entries;
          * targetData is the list of target prices, aligned with data;
          * labels is the list of column names of the rows in data.
    """
    col_list = ["brand", "city", "title", "desc", "image_count", "created_at", "price"]
    df = pd.read_csv(dataFile, usecols=col_list)

    print("preprocessing started...")

    # Progress messages are printed only for steps that actually ran.
    if brandEncType == "onehot":
        df = pd.concat([df, pd.get_dummies(df['brand'], prefix="brand")], axis=1)
        print("brand onehot encoding done...")

    if cityEncType == "onehot":
        df = pd.concat([df, pd.get_dummies(df['city'], prefix="city")], axis=1)
        print("city onehot encoding done...")

    # Column-wise map instead of a row-wise apply: only created_at is needed.
    df['dayOfweek'] = df['created_at'].map(fetchDay)
    df['time'] = df['created_at'].map(fetchTime)
    df = df.drop(columns=['created_at'])

    print("time preprocessing done...")

    df['price'] = df['price'].map(correctThePrice)
    # correctThePrice marks rejected prices with -11; the -1 sentinel survives this filter.
    df = df.loc[df['price'] > -10]

    brands = df.brand.unique()
    avgPrices = getAveragePricesPerBrands(df, brands)

    print("price correction done, average price per every brand calculated...")

    # Targets: the row's own price, or the brand average where the price is the -1 sentinel.
    targetData = [
        avgPrices[brand] if price == -1 else price
        for brand, price in zip(df['brand'], df['price'])
    ]

    # Drop the non-feature columns once for the whole frame rather than per row.
    features = df.drop(columns=['price', 'brand', 'city'])
    data = [featureRow for _, featureRow in features.iterrows()]
    labels = list(features)

    return [data, targetData, labels]

In [133]:
# Build the per-row features, the price targets and the feature names from the CSV.
data, targetData, labels = getProcessedData("mobile_phone_dataset.csv")

# informationGains = mutual_info_classif(data, targetData)

{'Nokia::نوکیا': 206977, 'Apple::اپل': 1199415, 'Samsung::سامسونگ': 670717, 'Huawei::هوآوی': 353506, 'LG::ال\u200cجی': 587703, 'ZTE::زدتی\u200cای': 400267, 'Sony::سونی': 481580, 'HTC::اچ\u200cتی\u200cسی': 474667, 'Lenovo::لنوو': 345989}
title                                                            سامسونگ j5
desc                      گوشى بسیار بسیار تمیز و فقط سه هفته کارکرده و ...
image_count                                                               2
brand_Apple::اپل                                                          0
brand_HTC::اچ‌تی‌سی                                                       0
brand_Huawei::هوآوی                                                       0
brand_LG::ال‌جی                                                           0
brand_Lenovo::لنوو                                                        0
brand_Nokia::نوکیا                                                        0
brand_Samsung::سامسونگ                                                    1
bra