# TF-IDF

In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the two text datasets. The larger one is later used to fit the
# vectorizer; the smaller one is only ever transformed.
dataset_3000, dataset_300 = (
    pd.read_pickle('./dataset/text_3000.pickle'),
    pd.read_pickle('./dataset/text_300.pickle'),
)

# Sanity check: (rows, columns) of each frame, one per line.
print(dataset_3000.shape, dataset_300.shape, sep='\n')

(3000, 2)
(300, 2)


In [3]:
# Preview the first five rows (text plus its label).
dataset_3000.head(n=5)

Unnamed: 0,text,label
0,"""logic of empire"" is a science fiction novel b...",1
1,"major general richard hutton davies, (14 nove...",1
2,elgin reptiles is the name given to a group of...,1
3,"dubgaill and finngaill, or dubgenti and finnge...",0
4,chang teh-ming (; born 1938) is a taiwanese ph...,1


In [4]:
# Baseline TF-IDF with default settings; the vocabulary is learned from the
# 3000-row dataset only (`fit` returns the fitted vectorizer itself).
vectorizer = TfidfVectorizer().fit(dataset_3000['text'])
print('vocabulary size: ', len(vectorizer.vocabulary_))

# Encode both datasets against that same learned vocabulary.
vector_3000, vector_300 = (
    vectorizer.transform(dataset_3000['text']),
    vectorizer.transform(dataset_300['text']),
)

vocabulary size:  36862


The vocabulary is too large and the resulting vectors are too long, so we keep only the terms that appear in at least 15 documents (`min_df=15`).

In [5]:
# Refit with min_df=15: terms found in fewer than 15 documents are dropped,
# which shrinks the vocabulary and therefore the vector length.
vectorizer = TfidfVectorizer(min_df=15).fit(dataset_3000['text'])
print('vocabulary size: ', len(vectorizer.vocabulary_))

# Re-encode both datasets with the reduced vocabulary (replaces the earlier
# vector_3000 / vector_300).
vector_3000, vector_300 = (
    vectorizer.transform(dataset_3000['text']),
    vectorizer.transform(dataset_300['text']),
)

vocabulary size:  2815


In [6]:
# Pair every dense TF-IDF row with its label, one row per document.
#
# Labels are taken positionally via `.to_numpy()`. Assigning the label Series
# directly would align on the index, so a source frame without a default
# 0..n-1 index would silently end up with misplaced or NaN labels.
# Building the frame in one constructor call also makes pandas raise if the
# number of vectors and labels ever differ.
dataset_3000_vector = pd.DataFrame({
    'vector': list(vector_3000.toarray()),  # one 1-D array per row
    'label': dataset_3000['label'].to_numpy(),
})

dataset_300_vector = pd.DataFrame({
    'vector': list(vector_300.toarray()),
    'label': dataset_300['label'].to_numpy(),
})

In [7]:
# Persist the vectorised frames so later steps can reload them without
# recomputing TF-IDF.
pd.to_pickle(dataset_3000_vector, './dataset/vector_3000_tfidf.pickle')
pd.to_pickle(dataset_300_vector, './dataset/vector_300_tfidf.pickle')

# Read from pickle

In [8]:
# Reload the saved frames to confirm the pickles round-trip.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by this notebook.
dataset_3000_vector, dataset_300_vector = (
    pd.read_pickle('./dataset/vector_3000_tfidf.pickle'),
    pd.read_pickle('./dataset/vector_300_tfidf.pickle'),
)

# Row count of each split.
print('train set size: ', dataset_3000_vector.shape[0])
print('test set size: ', dataset_300_vector.shape[0])

# Length of the vector stored in the row labelled 0 of each split; the two
# should agree because both were encoded with the same vectorizer.
print('train set vector size: ', len(dataset_3000_vector['vector'][0]))
print('test set vector size: ', len(dataset_300_vector['vector'][0]))

train set size:  3000
test set size:  300
train set vector size:  2815
test set vector size:  2815


In [9]:
# Preview the first five (vector, label) rows of the reloaded frame.
dataset_3000_vector.head(n=5)

Unnamed: 0,vector,label
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,"[0.0, 0.12388382086309918, 0.0, 0.0, 0.0, 0.0,...",1
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,"[0.0, 0.0, 0.0, 0.10454673097714223, 0.0, 0.0,...",0
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


# For GPT 3.5

In [10]:
# Load the "Turbo" variants of the two text datasets.
# NOTE: this rebinds dataset_3000 / dataset_300, so every cell below this
# point operates on the Turbo data, not the frames loaded at the top.
dataset_3000, dataset_300 = (
    pd.read_pickle('./dataset/text_3000_Turbo.pickle'),
    pd.read_pickle('./dataset/text_300_Turbo.pickle'),
)

# Sanity check: (rows, columns) of each frame, one per line.
print(dataset_3000.shape, dataset_300.shape, sep='\n')

(3000, 2)
(300, 2)


In [11]:
# Fit a fresh TF-IDF vectorizer on the currently loaded 3000-row dataset,
# again dropping terms that occur in fewer than 15 documents.
vectorizer = TfidfVectorizer(min_df=15).fit(dataset_3000['text'])
print('vocabulary size: ', len(vectorizer.vocabulary_))

# Encode both datasets against the newly learned vocabulary.
vector_3000, vector_300 = (
    vectorizer.transform(dataset_3000['text']),
    vectorizer.transform(dataset_300['text']),
)

vocabulary size:  4199


In [12]:
# Pair every dense TF-IDF row with its label, one row per document.
#
# Labels are taken positionally via `.to_numpy()`. Assigning the label Series
# directly would align on the index, so a source frame without a default
# 0..n-1 index would silently end up with misplaced or NaN labels.
# Building the frame in one constructor call also makes pandas raise if the
# number of vectors and labels ever differ.
dataset_3000_vector = pd.DataFrame({
    'vector': list(vector_3000.toarray()),  # one 1-D array per row
    'label': dataset_3000['label'].to_numpy(),
})

dataset_300_vector = pd.DataFrame({
    'vector': list(vector_300.toarray()),
    'label': dataset_300['label'].to_numpy(),
})

In [13]:
# Persist the vectorised frames under their own "_Turbo" file names so they
# do not overwrite the pickles saved earlier in the notebook.
pd.to_pickle(dataset_3000_vector, './dataset/vector_3000_Turbo_tfidf.pickle')
pd.to_pickle(dataset_300_vector, './dataset/vector_300_Turbo_tfidf.pickle')