# 1. Read Dataset

In [1]:
import pandas as pd

In [2]:
# Read all the data from csv file
def read_dataset(file_path):
    names = ['airline_sentiment', 'text']
    data = pd.read_csv(file_path, names = names, header=0)
    return data

In [3]:
def read_train_test_data(dir='Data'):
    train_data = read_dataset(dir + "/train.csv")
    test_data = read_dataset(dir + "/test.csv")
    return train_data, test_data

In [4]:
def sep_text_sentiment(data):
    return data.iloc[:, 1].values.tolist(), data.iloc[:, 0].values.tolist()

# 2. Data Cleaning

### 2.1. Remove hashtags

In [5]:
# Remove the hashtags from the data #xyz
def remove_hashtags(data):
    return [re.sub(r'#\w+ ?', '', text) for text in data]

### 2.2. Remove User mentions

In [6]:
# Remove the user mentions from the data @xyz
def remove_um(data):
    return [re.sub(r'@\w+ ?', '', text) for text in data]

### 2.3. Remove urls

In [7]:
# Remove the urls from the data
def remove_urls(data):
    return [re.sub(r'http\S+', '', text) for text in data]

### 2.4. Remove Stop words

In [8]:
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/usama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def rem_stop_words(data):
    return [" ".join(text for text in text.split() if text not in stop) for text in data]

### 2.5. Remove Punctuations

In [10]:
def rem_punc(data):
    return [re.sub(r'[^\w\s]', '', text) for text in data]

### 2.6. Remove numbers

In [11]:
def rem_nums(data):
    return [re.sub('\d+', '', text) for text in data]
    

### 2.7 Lower case data 

In [12]:
def lower(data):
    return [text.lower() for text in data]

### 2.8. Split texts to tokens

In [13]:
def bow(data):
    return [text.split() for text in data]

### 2.9. Apply all to clean data

In [14]:
def clean_data(data):
    data = remove_hashtags(data)
    data = remove_um(data)
    data = remove_urls(data)
    data = rem_stop_words(data)
    data = rem_punc(data)
    data = rem_nums(data)
    data = lower(data)
#     data = bow(data)
    return data

In [15]:
def tfidf_vec(data):
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # create the transform
    vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
    # Tokenize and build
    return vectorizer.fit_transform(data).toarray()

In [16]:
# def skipgram_w2v(data):
#     from gensim.models import Word2Vec
#     # training the model
#     skipgram = Word2Vec(data, window = 3, min_count=1, sg = 1)
#     return skipgram

In [17]:
# def save_model(model, file='skipgram.bin'):
#     model.save(file)

In [18]:
# def load_model(file='skipgram.bin'):
#     return Word2Vec.load(file)

In [19]:
def save_model(model, file='finalized_model.sav'):
    import pickle
    pickle.dump(model, open(file, 'wb'))

In [20]:
def load_model(file='finalized_model.sav'):
    import pickle
    return pickle.load(open(file, 'rb'))

In [366]:
def main():
    from sklearn.ensemble import RandomForestClassifier
    
    # Reading Dataset
    train_data, test_data = read_train_test_data()
    train_texts, train_sentiment = sep_text_sentiment(train_data)
    test_texts, test_sentiment = sep_text_sentiment(test_data)
    
    # Cleaning Data
    train_texts = clean_data(train_texts)
    test_texts = clean_data(test_texts)
    
    train_skipgram = skipgram_w2v(train_texts)
    test_skipgram = skipgram_w2v(test_texts)
#     save_model(train_skipgram)
#     train_skipgram = load_model()
    train_skipgram = (tfidf_vec(train_texts))
    
#     train_skipgram = np.array(train_skipgram)
    text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
    text_classifier.fit(train_skipgram, train_sentiment)
    
    text_classifier.predict("I have no choice but to fly Southwest to Vega")
#     test_skipgram = (tfidf_vec(test_texts))
#     text_classifier.fit(test_skipgram, test_sentiment)
#     predictions = text_classifier.predict(test_skipgram)
    
    
    from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

    print(confusion_matrix(test_sentiment,predictions))
    print(classification_report(test_sentiment,predictions))
    print(accuracy_score(test_sentiment, predictions))

In [367]:
if __name__ == "__main__":
    main()

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


KeyboardInterrupt: 

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# # Reading Dataset
train_data, test_data = read_train_test_data()
train_texts, train_sentiment = sep_text_sentiment(train_data)
test_texts, test_sentiment = sep_text_sentiment(test_data)

# Cleaning Data
train_texts = clean_data(train_texts)
test_texts = clean_data(test_texts)

# train_skipgram = skipgram_w2v(train_texts)
# test_skipgram = skipgram_w2v(test_texts)
# train_skipgram = tfidf_vec(train_texts)
# train_skipgram = load_model()

# Train the model
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(train_skipgram.reshape(-1, 1), train_sentiment)
# save_model(text_classifier)


# text_classifier.predict("I have no choice but to fly Southwest to Vega")
test_skipgram = tfidf_vec(test_texts)
# text_classifier.fit(test_skipgram, test_sentiment)
# predictions = text_classifier.predict(test_skipgram)

# print(confusion_matrix(test_sentiment,predictions))
# print(classification_report(test_sentiment,predictions))
# print(accuracy_score(test_sentiment, predictions))

In [70]:
model = load_model()
# test_skipgram = model.transform(test_skipgram)
new_tfidf = model.transform(test_skipgram)
# test_skipgram = (tfidf_vec(test_skipgram))
predictions = model.predict(test_skipgram)

AttributeError: 'RandomForestClassifier' object has no attribute 'transform'

In [98]:
from sklearn.feature_extraction.text import CountVectorizer

data = train_data.append(test_data)
sent = train_sentiment.append(test_sentiment)
vectorizer = CountVectorizer()

features = vectorizer.fit_transform(data)
features_nd = features.toarray() # for easy usage

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        sent,
        train_size=0.80, 
        random_state=1234)


TypeError: 'NoneType' object is not subscriptable

In [99]:
features_nd

array([[1, 0],
       [0, 1]])