# 1. Read Dataset

In [2]:
import pandas as pd

In [3]:
# Read all the data from csv file
def read_dataset(file_path):
    """Load a sentiment CSV into a DataFrame.

    The first row of the file is consumed as a header and its labels are
    replaced by the column names ``airline_sentiment`` and ``text``.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    column_names = ['airline_sentiment', 'text']
    return pd.read_csv(file_path, header=0, names=column_names)

In [4]:
def read_train_test_data(dir='Data'):
    """Read ``train.csv`` and ``test.csv`` from a directory.

    Args:
        dir: Directory prefix; the file names are appended with a ``/``.

    Returns:
        A ``(train_data, test_data)`` tuple of whatever ``read_dataset`` returns.
    """
    train_path = dir + "/train.csv"
    train_data = read_dataset(train_path)

    test_path = dir + "/test.csv"
    test_data = read_dataset(test_path)

    return (train_data, test_data)

In [5]:
def sep_text_sentiment(data):
    """Split a DataFrame into parallel lists of texts and sentiments.

    Args:
        data: DataFrame whose column 0 holds the sentiment and column 1 the text.

    Returns:
        A ``(texts, sentiments)`` tuple of plain Python lists.
    """
    texts = data.iloc[:, 1].values.tolist()
    sentiments = data.iloc[:, 0].values.tolist()
    return texts, sentiments

# 2. Data Cleaning

### 2.1. Remove hashtags

In [6]:
# Remove the hashtags from the data #xyz
def remove_hashtags(data):
    """Delete ``#hashtag`` tokens from every text.

    A hashtag is ``#`` followed by one or more word characters; a single
    space directly after it is removed as well.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the hashtags stripped from each string.
    """
    cleaned = []
    for tweet in data:
        cleaned.append(re.sub(r'#\w+ ?', '', tweet))
    return cleaned

### 2.2. Remove User mentions

In [7]:
# Remove the user mentions from the data @xyz
def remove_um(data):
    """Delete ``@user`` mentions from every text.

    A mention is ``@`` followed by one or more word characters; a single
    space directly after it is removed as well.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the mentions stripped from each string.
    """
    def strip_mentions(tweet):
        return re.sub(r'@\w+ ?', '', tweet)

    return list(map(strip_mentions, data))

### 2.3. Remove urls

In [8]:
# Remove the urls from the data
def remove_urls(data):
    """Delete URLs from every text.

    Anything starting with ``http`` up to the next whitespace is removed;
    surrounding spaces are left untouched.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the URLs stripped from each string.
    """
    without_urls = []
    for tweet in data:
        stripped = re.sub(r'http\S+', '', tweet)
        without_urls.append(stripped)
    return without_urls

### 2.4. Remove Stop words

In [9]:
import nltk
import re
# Make sure the NLTK stop-word corpus is available locally before importing it.
nltk.download("stopwords")
from nltk.corpus import stopwords
# Module-level English stop-word list; rem_stop_words reads this name.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/usama/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def rem_stop_words(data, stop_words=None):
    """Drop stop words from every text.

    Each text is split on whitespace, words found in the stop list are
    discarded, and the remaining words are re-joined with single spaces.
    Matching is case-sensitive.

    Args:
        data: Iterable of strings.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``stop`` list is used, as before.

    Returns:
        A new list of strings with the stop words removed.
    """
    if stop_words is None:
        stop_words = stop
    # A set gives constant-time membership tests instead of scanning a list
    # once per word.
    blocked = set(stop_words)
    return [
        " ".join(word for word in text.split() if word not in blocked)
        for text in data
    ]

### 2.5. Remove Punctuations

In [11]:
def rem_punc(data):
    """Remove punctuation from every text.

    Every character that is neither a word character nor whitespace is
    deleted (underscores count as word characters and are kept).

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the punctuation stripped from each string.
    """
    result = []
    for tweet in data:
        result.append(re.sub(r'[^\w\s]', '', tweet))
    return result

### 2.6. Remove numbers

In [12]:
def rem_nums(data):
    """Remove every run of digits from each text.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the digits stripped from each string; surrounding
        whitespace is left untouched.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [re.sub(r'\d+', '', text) for text in data]
    

### 2.7. Lowercase data

In [13]:
def lower(data):
    """Lower-case every text.

    Args:
        data: Iterable of objects exposing ``.lower()`` (strings).

    Returns:
        A new list containing the lower-cased texts.
    """
    lowered = []
    for tweet in data:
        lowered.append(tweet.lower())
    return lowered

### 2.8. Split texts to tokens

In [14]:
def tokenize(data):
    """Split every text into whitespace-separated tokens.

    Args:
        data: Iterable of strings.

    Returns:
        A list of token lists, one per input text.
    """
    token_lists = []
    for tweet in data:
        token_lists.append(tweet.split())
    return token_lists

### 2.9. Apply all to clean data

In [15]:
def clean_data(data):
    """Run the full cleaning pipeline over a list of texts.

    The steps are applied in this order: hashtag removal, user-mention
    removal, URL removal, punctuation removal, digit removal, lower-casing.

    Args:
        data: List of raw strings.

    Returns:
        The list of cleaned strings.
    """
    # Stop-word removal (rem_stop_words) is not applied here.
    steps = (
        remove_hashtags,
        remove_um,
        remove_urls,
        rem_punc,
        rem_nums,
        lower,
    )
    cleaned = data
    for step in steps:
        cleaned = step(cleaned)
    return cleaned

In [16]:
def tfidf_vec(data):
    """Fit a TF-IDF vectorizer on ``data`` and return dense features.

    The vectorizer is configured with ``max_features=2500``, ``min_df=7``
    and ``max_df=0.8``. The fitted vectorizer itself is not returned.

    Args:
        data: Iterable of text documents.

    Returns:
        The fitted-and-transformed matrix converted with ``.toarray()``.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
    # Learn the vocabulary and vectorize in one pass, then densify.
    sparse_features = tfidf.fit_transform(data)
    return sparse_features.toarray()

In [17]:
# def skipgram_w2v(data):
#     from gensim.models import Word2Vec
#     # training the model
#     skipgram = Word2Vec(data, window = 3, min_count=1, sg = 1)
#     return skipgram

In [18]:
# def save_model(model, file='skipgram.bin'):
#     model.save(file)

In [19]:
def get_input():
    """Prompt the user for a tweet and return the entered line."""
    tweet = input("Enter Your tweet: ")
    return tweet

In [20]:
def save_model(model, file='finalized_model.sav'):
    """Serialize ``model`` to disk with pickle.

    Args:
        model: Any picklable object (here, the trained classifier).
        file: Destination path; it is overwritten if it already exists.
    """
    import pickle

    # The context manager guarantees the handle is flushed and closed even
    # if pickling fails part-way through.
    with open(file, 'wb') as handle:
        pickle.dump(model, handle)

In [21]:
def load_model(file='finalized_model.sav'):
    """Load a pickled object from disk.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If ``file`` does not exist.
    """
    import pickle

    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [22]:
def accuracy(y_test, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Labels are compared position by position over ``len(y_test)`` entries.

    Args:
        y_test: Indexable sequence of true labels.
        y_pred: Indexable sequence of predicted labels.

    Returns:
        The number of matching positions divided by ``len(y_test)``, as a float.
    """
    total = len(y_test)
    hits = sum(1 for idx in range(total) if y_pred[idx] == y_test[idx])
    return float(hits) / total

In [23]:
def rand_test(data, X_test, y_pred, features_nd, n):
    """Print ``n`` consecutive test tweets with their predictions.

    A random start offset into ``X_test`` is chosen; for each of the next
    ``n`` rows the original tweet is recovered by locating the row's feature
    vector in ``features_nd`` and using that position to index ``data``.

    Args:
        data: List of texts, aligned row-for-row with ``features_nd``.
        X_test: Array of test feature rows (each row supports ``.tolist()``).
        y_pred: Predictions aligned with ``X_test``.
        features_nd: Array of all feature rows (supports ``.tolist()``).
        n: Number of consecutive samples to print.

    Raises:
        ValueError: If ``n`` exceeds ``len(X_test)`` (empty random range) or
            a test row is not present in ``features_nd``.
    """
    import random

    # Convert the full feature matrix once; it does not change inside the
    # loop, and rebuilding it on every iteration is needlessly expensive.
    feature_rows = features_nd.tolist()

    start = random.randint(0, len(X_test) - n)
    for i in range(start, start + n):
        # The first row with an identical feature vector identifies the tweet.
        ind = feature_rows.index(X_test[i].tolist())
        print("Tweet: ", end="")
        print(data[ind])
        print("Prediction: ", end="")
        print(y_pred[i])

In [32]:
def sentiment_(n):
    """Map a numeric class label to a human-readable sentiment name.

    Args:
        n: Predicted class label.

    Returns:
        ``"Negative"`` for 0, ``"Positive"`` for 1, and ``"Neutral"`` for
        any other value.
    """
    if n == 0:
        return "Negative"
    elif n == 1:
        return "Positive"
    else:
        # Spelling corrected from the former "Netural".
        return "Neutral"

SyntaxError: invalid syntax (<ipython-input-32-24eb9f4846e1>, line 2)

In [25]:
# def main():
#     from sklearn.ensemble import RandomForestClassifier
    
#     # Reading Dataset
#     train_data, test_data = read_train_test_data()
#     train_texts, train_sentiment = sep_text_sentiment(train_data)
#     test_texts, test_sentiment = sep_text_sentiment(test_data)
    
#     # Cleaning Data
#     train_texts = clean_data(train_texts)
#     test_texts = clean_data(test_texts)
    
#     train_skipgram = skipgram_w2v(train_texts)
#     test_skipgram = skipgram_w2v(test_texts)
# #     save_model(train_skipgram)
# #     train_skipgram = load_model()
#     train_skipgram = (tfidf_vec(train_texts))
    
# #     train_skipgram = np.array(train_skipgram)
#     text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
#     text_classifier.fit(train_skipgram, train_sentiment)
    
#     text_classifier.predict("I have no choice but to fly Southwest to Vega")
# #     test_skipgram = (tfidf_vec(test_texts))
# #     text_classifier.fit(test_skipgram, test_sentiment)
# #     predictions = text_classifier.predict(test_skipgram)
    
    
#     from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#     print(confusion_matrix(test_sentiment,predictions))
#     print(classification_report(test_sentiment,predictions))
#     print(accuracy_score(test_sentiment, predictions))

In [30]:
def main():
    """Train or load the tweet classifier, report accuracy, then classify input.

    Steps:
      1. Read the train/test CSVs, clean the texts and merge both sets.
      2. Hash the texts into 10000-dimensional feature vectors.
      3. Re-split the merged data 80/20 with a fixed random state.
      4. Load the saved classifier if the model file exists; otherwise train
         a random forest on the training split and save it.
      5. Print the accuracy on the held-out split.
      6. Repeatedly prompt for a tweet and print its predicted sentiment
         until the user enters ``0``.
    """
    import os

    from sklearn.feature_extraction.text import HashingVectorizer
    from sklearn.model_selection import train_test_split

    model_file = 'finalized_model.sav'

    # Reading Dataset
    train_data, test_data = read_train_test_data()
    train_texts, train_sentiment = sep_text_sentiment(train_data)
    test_texts, test_sentiment = sep_text_sentiment(test_data)

    # Cleaning Data
    train_texts = clean_data(train_texts)
    test_texts = clean_data(test_texts)

    data = train_texts + test_texts
    sent = train_sentiment + test_sentiment

    # Text Embedding
    vectorizer = HashingVectorizer(n_features=10000)
    # HashingVectorizer is stateless (no vocabulary is learned), so
    # transform() can be called directly without a prior fit.
    features = vectorizer.transform(data)
    features_nd = features.toarray()  # for easy usage

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
            features_nd,
            sent,
            train_size=0.80,
            random_state=1234)

    # Reuse the saved model when present; otherwise train and save one so a
    # first run works without editing the code.
    if os.path.exists(model_file):
        text_classifier = load_model(model_file)  # Load Model
    else:
        from sklearn.ensemble import RandomForestClassifier
        text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
        text_classifier.fit(X=X_train, y=y_train)
        save_model(text_classifier, model_file)  # Save Model

    y_pred = text_classifier.predict(X_test)
    print("Testing Accuracy: ", end="")
    print(accuracy(y_test, y_pred), end="\n\n")

    # Random Test (optional spot check of predictions)
#     rand_test(data, X_test, y_pred, features_nd, 5)

    # Test From Input
    while True:
        text = get_input()
        if text == '0':  # sentinel value that ends the interactive loop
            break
        text = clean_data([text])
        vec = vectorizer.transform(text)
        pred = text_classifier.predict(vec)[0]

        print(sentiment_(pred))


In [31]:
# Entry point: run the whole pipeline when this code is executed directly.
if __name__ == "__main__":
    main()

Testing Accuracy: 0.7359972677595629

Enter Your tweet: This is amazing
Positive
Enter Your tweet: We're still here guys  you are the worst
Negative
Enter Your tweet: Please dont trivialize me  this is a joke not a slow day or slow experience
Negative
Enter Your tweet: I have a party of  booked for a flight in aug at am how can i find out how much it would cost to change to an earlier flight
Negative
Enter Your tweet: Thank you very much
Positive
Enter Your tweet: I love it
Positive
Enter Your tweet: Hi I have a travel question could you, please follow me so I can DM you
Netural
Enter Your tweet: He is very bad boy
Negative
Enter Your tweet: Thanks
Positive
Enter Your tweet: No
Negative
Enter Your tweet: All good about her
Positive
Enter Your tweet: 0
