https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

In [1]:
from sklearn import model_selection,preprocessing,linear_model,naive_bayes,svm,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition,ensemble

  return f(*args, **kwds)


In [3]:
import pandas,xgboost,numpy,textblob,string
from keras.preprocessing  import text,sequence
from keras import layers,models, optimizers

# 1. Data Loading

In [11]:
data = open('corpus.txt',encoding="utf8").read()
labels, texts = [], []

In [13]:
data[:100]

'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in y'

In [14]:
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

In [15]:
labels[:5]

['__label__2', '__label__2', '__label__2', '__label__2', '__label__2']

In [16]:
# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [17]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 

In [22]:
encoder = preprocessing.LabelEncoder()
train_y= encoder.fit_transform(train_y)
valid_y=encoder.fit_transform(valid_y)

In [24]:
train_y

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

# 2. Feature Engineering

2.1 Count Vectors as features: 

Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.


In [25]:

trainDF['text'].head()

0    Stuning even for the non-gamer: This sound tra...
1    The best soundtrack ever to anything.: I'm rea...
2    Amazing!: This soundtrack is my favorite music...
3    Excellent Soundtrack: I truly like this soundt...
4    Remember, Pull Your Jaw Off The Floor After He...
Name: text, dtype: object

In [26]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [31]:
# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

2.2 TF-IDF Vectors as features

TF-IDF score is composed by two terms: the first computes the normalized Term Frequency (TF), the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [35]:
# word level tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [36]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [37]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

# 2.3 Word Embeddings

A word embedding is a form of representing words and documents using a dense vector representation. The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used.Word embeddings can be trained using the input corpus itself or can be generated using pre-trained word embeddings such as Glove, FastText, and Word2Vec. Any one of them can be downloaded and used as transfer learning. One can read more about word embeddings here (https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/).

In [49]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}

data=open('glove.6B.100d.txt',encoding="utf8")


# for i, line in enumerate(open('wiki-news-300d-1M.vec',encoding="utf8", errors='replace')):
#     values = line.split()
#     embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')


In [53]:
for i, line in enumerate(data):
    values = line.split()    
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [55]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['text'])
word_index = token.word_index

In [56]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)


In [62]:
# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    print(embedding_vector.shape)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

(100,)


ValueError: could not broadcast input array from shape (100) into shape (300)