In [22]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from imutils import paths
import numpy as np
import argparse
import imutils
import pandas as pd
import cv2
import os
from tqdm import tqdm

def image_to_feature_vector(image, size=(32, 32)):
	"""Resize an image to a fixed size and flatten it into a 1-D vector.

	Args:
		image: image array, e.g. as returned by ``cv2.imread``.
		size: target size handed to ``cv2.resize``.

	Returns:
		The resized image flattened into a list of raw pixel intensities.

	Raises:
		ValueError: if ``image`` is ``None``, which is what ``cv2.imread``
			returns when a file is missing or cannot be decoded.
	"""
	# fail with a readable message instead of an opaque OpenCV assertion
	if image is None:
		raise ValueError(
			"image is None (cv2.imread returns None for missing or unreadable files)")

	return cv2.resize(image, size).flatten()

def extract_color_histogram(image, bins=(8, 8, 8)):
	"""Compute a normalized, flattened 3D color histogram in HSV space.

	Args:
		image: BGR image array (it is converted with ``cv2.COLOR_BGR2HSV``).
		bins: number of histogram bins per HSV channel.

	Returns:
		The normalized histogram flattened into a 1-D feature vector.

	Raises:
		ValueError: if ``image`` is ``None``, which is what ``cv2.imread``
			returns when a file is missing or cannot be decoded.
	"""
	# fail with a readable message instead of an opaque OpenCV assertion
	if image is None:
		raise ValueError(
			"image is None (cv2.imread returns None for missing or unreadable files)")

	# histogram over all three channels; ranges are [0, 180) for the first
	# channel and [0, 256) for the other two
	hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
	hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
		[0, 180, 0, 256, 0, 256])

	if imutils.is_cv2():
		# OpenCV 2.4.X: normalize returns the normalized histogram
		hist = cv2.normalize(hist)
	else:
		# OpenCV 3 and later: normalization is done "in place" by passing
		# the histogram as its own destination
		cv2.normalize(hist, hist)

	# return the flattened histogram as the feature vector
	return hist.flatten()

In [23]:
# Input locations: the folder holding the preprocessed annotation file and
# the root folder that the image names in that file are relative to.
data_path = '/home/eric/Documents/Hashtag-recommendation-for-social-images/image_text_hashtagging/datasets/image_text/preprocessed_data/'
image_path = '/home/eric/data/social_images'
training_filename = os.path.join(data_path, 'training_data.txt')

print("[INFO] describing images...")
print('Loading training dataset...')

# The annotation file is a '*'-separated table. The 'tweets' column is not
# used here, so it is dropped right after loading.
annotations = pd.read_table(training_filename, delimiter='*').drop(columns=['tweets'])
print(annotations.iloc[0, 1])
print(annotations.head())

# Keep the rows as plain Python lists (the original author noted a problem
# when converting them to a NumPy string array).
train_data = annotations.values.tolist()
print(train_data[0][1])
print(len(train_data))

# Containers filled later: raw pixel intensities, feature vectors and labels.
rawImages, features, labels = [], [], []

[INFO] describing images...
Loading training dataset...
me night fun happy coffee love amazing cute day likeforfollow likeforlikes woman instagood mood white black
                                       image_names  \
0          dataset/fun/2018-12-28_09-41-29_UTC.jpg   
1  dataset/goodmorning/2018-12-25_17-20-22_UTC.jpg   
2         dataset/baby/2018-12-25_07-10-14_UTC.jpg   
3      dataset/wedding/2019-01-02_22-01-00_UTC.jpg   
4    dataset/instafood/2018-12-21_15-26-39_UTC.jpg   

                                            hashtags  
0  me night fun happy coffee love amazing cute da...  
1  christmas hiphop music video swag photo follow...  
2                     pink kitten cute babygirl baby  
3  shoes streetwear streetart streetstyle streetp...  
4  food foodporn instafood foodie eeeeeats nomnom...  
me night fun happy coffee love amazing cute day likeforfollow likeforlikes woman instagood mood white black
51172


In [24]:
# Start from empty containers so that re-running this cell does not append
# every sample a second time (or fail because the names were already
# converted to NumPy arrays further down).
rawImages = []
features = []
labels = []

for img_file, hashtags in tqdm(train_data):
    img_file_path = os.path.join(image_path, img_file)
    image = cv2.imread(img_file_path)

    # cv2.imread reports failure by returning None rather than raising.
    # Abort with the offending path instead of skipping the sample: the HDF5
    # export cell pairs rawImages with train_data by position, so the lists
    # must stay aligned.
    if image is None:
        raise IOError("could not read image: {}".format(img_file_path))

    # raw pixel intensities
    rawImages.append(image_to_feature_vector(image))
    # color histogram; without this `features` stays empty and the later
    # train_test_split(features, labels) fails on mismatched sample counts
    features.append(extract_color_histogram(image))
    # the label of a sample is its list of whitespace-separated hashtags
    labels.append(hashtags.strip().split())

100%|██████████| 51172/51172 [14:11<00:00, 60.12it/s]


In [25]:
# Sanity check: show the parsed hashtag lists of the first samples.
n_preview_labels = 10
print(labels[:n_preview_labels])

[['me', 'night', 'fun', 'happy', 'coffee', 'love', 'amazing', 'cute', 'day', 'likeforfollow', 'likeforlikes', 'woman', 'instagood', 'mood', 'white', 'black'], ['christmas', 'hiphop', 'music', 'video', 'swag', 'photo', 'follow', 'goodmorning', 'morning', 'love', 'california', 'rap', 'green'], ['pink', 'kitten', 'cute', 'babygirl', 'baby'], ['shoes', 'streetwear', 'streetart', 'streetstyle', 'streetphotography', 'new', 'beauty', 'hair', 'pretty', 'fashion', 'happiness', 'love', 'yummy', 'model', 'instagood', 'makeup', 'kpop', 'weddingdress', 'wine', 'wedding', 'music', 'dress', 'nails', 'life', 'goodmorning', 'goodnight'], ['food', 'foodporn', 'instafood', 'foodie', 'eeeeeats', 'nomnom', 'foodstagram', 'foodgram', 'yummy', 'foodblogger', 'foodphotography', 'foodgasm'], ['weddingday', 'canon', 'amazing', 'art', 'artistic', 'colors', 'cool', 'instaphoto', 'instapic', 'photo', 'photograph', 'photography', 'photooftheday', 'photos', 'photoshop', 'picoftheday', 'fashion'], ['cute', 'awesome',

In [26]:
import h5py
# Store the raw pixel vector of every image in train.h5, one dataset per
# image, named after the image's relative path (first column of train_data).
if len(rawImages) != len(train_data):
    raise ValueError(
        "rawImages has {} entries but train_data has {}; re-run the feature "
        "extraction cell first".format(len(rawImages), len(train_data)))

# The context manager closes the file even if create_dataset raises part-way
# through, so the handle is not leaked and the file is not left open.
# `f_out` stays bound afterwards, so a later f_out.close() still resolves.
with h5py.File("train.h5", "w") as f_out:
    for row, pixels in zip(train_data, rawImages):
        f_out.create_dataset(name=row[0], data=pixels)

In [27]:
# Close the h5py file `f_out` that the previous cell opened for writing
# as "train.h5".
f_out.close()

In [5]:
# Stack the per-image vectors into 2-D matrices (one row per image).
rawImages = np.array(rawImages)
features = np.array(features)

# Each label is a list of hashtags and the lists differ in length. Calling
# np.array() on such a ragged list raises ValueError on recent NumPy releases
# (and was deprecated before that), so build a 1-D object array explicitly,
# with one hashtag list per element.
hashtag_lists = list(labels)
labels = np.empty(len(hashtag_lists), dtype=object)
for idx, tags in enumerate(hashtag_lists):
    labels[idx] = tags

In [6]:
# Report the memory footprint of both matrices.
# NOTE(review): the divisor is 1024 * 1000, which is neither 10**6 (MB) nor
# 2**20 (MiB) - confirm which unit was intended.
size_unit = 1024 * 1000.0
pixels_size = rawImages.nbytes / size_unit
features_size = features.nbytes / size_unit
print("[INFO] pixels matrix: {:.2f}MB".format(pixels_size))
print("[INFO] features matrix: {:.2f}MB".format(features_size))

[INFO] pixels matrix: 112.50MB
[INFO] features matrix: 75.00MB


In [7]:
# Hold out a fraction of the samples for testing. Both calls share the same
# options (including the random seed), once for the raw pixel matrix and
# once for the feature matrix.
test_size = 0.1
split_options = dict(test_size=test_size, random_state=42)

trainRI, testRI, trainRL, testRL = train_test_split(
    rawImages, labels, **split_options)
trainFeat, testFeat, trainLabels, testLabels = train_test_split(
    features, labels, **split_options)

In [13]:
# Sanity check: show the raw pixel vectors of the first held-out samples.
n_preview_rows = 100
print(testRI[:n_preview_rows])

[[113 100  45 ...  12  60  64]
 [  8  11  12 ... 124  89  60]
 [169 173 174 ...  38  73  87]
 ...
 [ 79 113 136 ... 190 191 187]
 [ 89  97 114 ...  52  57  55]
 [ 20 127 160 ...  74 113 127]]
